Bundled libwebp updated to version 0.6.0

This commit imports libwebp 0.6.0, including AUTHORS, COPYING, ChangeLog, NEWS, PATENTS, README and src directories. From src, only header and source files are included. Upstream changes since 0.5.1 have been merged in. The version in qt_attribution.json has also been updated. Conflicts: src/3rdparty/libwebp.pri src/3rdparty/libwebp/qt_attribution.json src/3rdparty/libwebp/src/webp/config.h Change-Id: I001aa7a3fabf0130b54f9005c23aa822bc1d0ec1 Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
author: Liang Qi <liang.qi@qt.io> 2017-03-07 13:05:21 +0100
committer: Liang Qi <liang.qi@qt.io> 2017-03-13 10:47:45 +0000
commit: b7ec9e78633d8f2c75a8b02e17e169497bb103e2 (patch)
tree: e4be04af4dbcf8cd635715efdf4e769281183746
parent: f2dbc67c2b032a5f27d0224e020fb6dfcd3fd142 (diff)
146 files changed, 14025 insertions, 6482 deletions
diff --git a/src/3rdparty/libwebp.pri b/src/3rdparty/libwebp.pri
index f21ba6d..61fbee9 100644
--- a/src/3rdparty/libwebp.pri
+++ b/src/3rdparty/libwebp.pri
@@ -1,30 +1,29 @@
-CONFIG += object_parallel_to_source no_batch
-
 INCLUDEPATH += \
     $$PWD/libwebp \
     $$PWD/libwebp/src \
     $$PWD/libwebp/src/dec \
     $$PWD/libwebp/src/enc \
-    $$PWD/libwebp/src/extra \
     $$PWD/libwebp/src/dsp \
     $$PWD/libwebp/src/mux \
     $$PWD/libwebp/src/utils \
     $$PWD/libwebp/src/webp
 
 SOURCES += \
-    $$PWD/libwebp/src/dec/alpha.c \
-    $$PWD/libwebp/src/dec/buffer.c \
-    $$PWD/libwebp/src/dec/frame.c \
-    $$PWD/libwebp/src/dec/idec.c \
-    $$PWD/libwebp/src/dec/io.c \
-    $$PWD/libwebp/src/dec/quant.c \
-    $$PWD/libwebp/src/dec/tree.c \
-    $$PWD/libwebp/src/dec/vp8.c \
-    $$PWD/libwebp/src/dec/vp8l.c \
-    $$PWD/libwebp/src/dec/webp.c \
+    $$PWD/libwebp/src/dec/alpha_dec.c \
+    $$PWD/libwebp/src/dec/buffer_dec.c \
+    $$PWD/libwebp/src/dec/frame_dec.c \
+    $$PWD/libwebp/src/dec/idec_dec.c \
+    $$PWD/libwebp/src/dec/io_dec.c \
+    $$PWD/libwebp/src/dec/quant_dec.c \
+    $$PWD/libwebp/src/dec/tree_dec.c \
+    $$PWD/libwebp/src/dec/vp8_dec.c \
+    $$PWD/libwebp/src/dec/vp8l_dec.c \
+    $$PWD/libwebp/src/dec/webp_dec.c \
     $$PWD/libwebp/src/demux/demux.c \
     $$PWD/libwebp/src/demux/anim_decode.c \
+    $$PWD/libwebp/src/dsp/alpha_processing.c \
     $$PWD/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c \
+    $$PWD/libwebp/src/dsp/alpha_processing_sse2.c \
     $$PWD/libwebp/src/dsp/alpha_processing_sse41.c \
     $$PWD/libwebp/src/dsp/argb.c \
     $$PWD/libwebp/src/dsp/argb_mips_dsp_r2.c \
@@ -35,80 +34,83 @@ SOURCES += \
     $$PWD/libwebp/src/dsp/cost_sse2.c \
     $$PWD/libwebp/src/dsp/cpu.c \
     $$PWD/libwebp/src/dsp/dec.c \
+    $$PWD/libwebp/src/dsp/dec_clip_tables.c \
+    $$PWD/libwebp/src/dsp/dec_mips32.c \
     $$PWD/libwebp/src/dsp/dec_mips_dsp_r2.c \
     $$PWD/libwebp/src/dsp/dec_msa.c \
     $$PWD/libwebp/src/dsp/dec_sse2.c \
     $$PWD/libwebp/src/dsp/dec_sse41.c \
     $$PWD/libwebp/src/dsp/enc.c \
+    $$PWD/libwebp/src/dsp/enc_avx2.c \
+    $$PWD/libwebp/src/dsp/enc_mips32.c \
     $$PWD/libwebp/src/dsp/enc_mips_dsp_r2.c \
+    $$PWD/libwebp/src/dsp/enc_msa.c \
     $$PWD/libwebp/src/dsp/enc_sse2.c \
     $$PWD/libwebp/src/dsp/enc_sse41.c \
     $$PWD/libwebp/src/dsp/filters.c \
     $$PWD/libwebp/src/dsp/filters_mips_dsp_r2.c \
+    $$PWD/libwebp/src/dsp/filters_msa.c \
     $$PWD/libwebp/src/dsp/filters_sse2.c \
     $$PWD/libwebp/src/dsp/lossless.c \
     $$PWD/libwebp/src/dsp/lossless_enc.c \
     $$PWD/libwebp/src/dsp/lossless_enc_mips32.c \
     $$PWD/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c \
+    $$PWD/libwebp/src/dsp/lossless_enc_msa.c \
     $$PWD/libwebp/src/dsp/lossless_enc_sse2.c \
     $$PWD/libwebp/src/dsp/lossless_enc_sse41.c \
     $$PWD/libwebp/src/dsp/lossless_mips_dsp_r2.c \
     $$PWD/libwebp/src/dsp/rescaler.c \
     $$PWD/libwebp/src/dsp/rescaler_mips32.c \
     $$PWD/libwebp/src/dsp/rescaler_mips_dsp_r2.c \
+    $$PWD/libwebp/src/dsp/rescaler_msa.c \
     $$PWD/libwebp/src/dsp/rescaler_sse2.c \
     $$PWD/libwebp/src/dsp/upsampling.c \
     $$PWD/libwebp/src/dsp/upsampling_mips_dsp_r2.c \
+    $$PWD/libwebp/src/dsp/upsampling_msa.c \
     $$PWD/libwebp/src/dsp/upsampling_sse2.c \
     $$PWD/libwebp/src/dsp/yuv.c \
     $$PWD/libwebp/src/dsp/yuv_mips_dsp_r2.c \
-    $$PWD/libwebp/src/dsp/alpha_processing.c \
-    $$PWD/libwebp/src/dsp/alpha_processing_sse2.c \
-    $$PWD/libwebp/src/dsp/dec_clip_tables.c \
-    $$PWD/libwebp/src/dsp/dec_mips32.c \
-    $$PWD/libwebp/src/dsp/enc_avx2.c \
-    $$PWD/libwebp/src/dsp/enc_mips32.c \
     $$PWD/libwebp/src/dsp/lossless_sse2.c \
     $$PWD/libwebp/src/dsp/yuv_mips32.c \
     $$PWD/libwebp/src/dsp/yuv_sse2.c \
-    $$PWD/libwebp/src/enc/alpha.c \
-    $$PWD/libwebp/src/enc/analysis.c \
-    $$PWD/libwebp/src/enc/backward_references.c \
-    $$PWD/libwebp/src/enc/config.c \
-    $$PWD/libwebp/src/enc/cost.c \
-    $$PWD/libwebp/src/enc/delta_palettization.c \
-    $$PWD/libwebp/src/enc/filter.c \
-    $$PWD/libwebp/src/enc/frame.c \
-    $$PWD/libwebp/src/enc/histogram.c \
-    $$PWD/libwebp/src/enc/iterator.c \
-    $$PWD/libwebp/src/enc/near_lossless.c \
-    $$PWD/libwebp/src/enc/picture.c \
-    $$PWD/libwebp/src/enc/quant.c \
-    $$PWD/libwebp/src/enc/syntax.c \
-    $$PWD/libwebp/src/enc/token.c \
-    $$PWD/libwebp/src/enc/tree.c \
-    $$PWD/libwebp/src/enc/vp8l.c \
-    $$PWD/libwebp/src/enc/webpenc.c \
-    $$PWD/libwebp/src/enc/picture_csp.c \
-    $$PWD/libwebp/src/enc/picture_psnr.c \
-    $$PWD/libwebp/src/enc/picture_rescale.c \
-    $$PWD/libwebp/src/enc/picture_tools.c \
-    $$PWD/libwebp/src/extras/extras.c \
+    $$PWD/libwebp/src/enc/alpha_enc.c \
+    $$PWD/libwebp/src/enc/analysis_enc.c \
+    $$PWD/libwebp/src/enc/backward_references_enc.c \
+    $$PWD/libwebp/src/enc/config_enc.c \
+    $$PWD/libwebp/src/enc/cost_enc.c \
+    $$PWD/libwebp/src/enc/delta_palettization_enc.c \
+    $$PWD/libwebp/src/enc/filter_enc.c \
+    $$PWD/libwebp/src/enc/frame_enc.c \
+    $$PWD/libwebp/src/enc/histogram_enc.c \
+    $$PWD/libwebp/src/enc/iterator_enc.c \
+    $$PWD/libwebp/src/enc/near_lossless_enc.c \
+    $$PWD/libwebp/src/enc/picture_enc.c \
+    $$PWD/libwebp/src/enc/picture_csp_enc.c \
+    $$PWD/libwebp/src/enc/picture_psnr_enc.c \
+    $$PWD/libwebp/src/enc/picture_rescale_enc.c \
+    $$PWD/libwebp/src/enc/picture_tools_enc.c \
+    $$PWD/libwebp/src/enc/predictor_enc.c \
+    $$PWD/libwebp/src/enc/quant_enc.c \
+    $$PWD/libwebp/src/enc/syntax_enc.c \
+    $$PWD/libwebp/src/enc/token_enc.c \
+    $$PWD/libwebp/src/enc/tree_enc.c \
+    $$PWD/libwebp/src/enc/vp8l_enc.c \
+    $$PWD/libwebp/src/enc/webp_enc.c \
     $$PWD/libwebp/src/mux/anim_encode.c \
     $$PWD/libwebp/src/mux/muxedit.c \
     $$PWD/libwebp/src/mux/muxinternal.c \
     $$PWD/libwebp/src/mux/muxread.c \
-    $$PWD/libwebp/src/utils/bit_reader.c \
-    $$PWD/libwebp/src/utils/bit_writer.c \
-    $$PWD/libwebp/src/utils/color_cache.c \
-    $$PWD/libwebp/src/utils/filters.c \
-    $$PWD/libwebp/src/utils/huffman.c \
-    $$PWD/libwebp/src/utils/huffman_encode.c \
-    $$PWD/libwebp/src/utils/quant_levels.c \
-    $$PWD/libwebp/src/utils/quant_levels_dec.c \
-    $$PWD/libwebp/src/utils/random.c \
-    $$PWD/libwebp/src/utils/rescaler.c \
-    $$PWD/libwebp/src/utils/thread.c \
+    $$PWD/libwebp/src/utils/bit_reader_utils.c \
+    $$PWD/libwebp/src/utils/bit_writer_utils.c \
+    $$PWD/libwebp/src/utils/color_cache_utils.c \
+    $$PWD/libwebp/src/utils/filters_utils.c \
+    $$PWD/libwebp/src/utils/huffman_utils.c \
+    $$PWD/libwebp/src/utils/huffman_encode_utils.c \
+    $$PWD/libwebp/src/utils/quant_levels_utils.c \
+    $$PWD/libwebp/src/utils/quant_levels_dec_utils.c \
+    $$PWD/libwebp/src/utils/random_utils.c \
+    $$PWD/libwebp/src/utils/rescaler_utils.c \
+    $$PWD/libwebp/src/utils/thread_utils.c \
     $$PWD/libwebp/src/utils/utils.c
 
 android {
@@ -122,8 +124,10 @@ integrity {
 
 equals(QT_ARCH, arm)|equals(QT_ARCH, arm64) {
     SOURCES_FOR_NEON += \
+        $$PWD/libwebp/src/dsp/alpha_processing_neon.c \
         $$PWD/libwebp/src/dsp/dec_neon.c \
         $$PWD/libwebp/src/dsp/enc_neon.c \
+        $$PWD/libwebp/src/dsp/filters_neon.c \
         $$PWD/libwebp/src/dsp/lossless_enc_neon.c \
         $$PWD/libwebp/src/dsp/lossless_neon.c \
         $$PWD/libwebp/src/dsp/rescaler_neon.c \
diff --git a/src/3rdparty/libwebp/AUTHORS b/src/3rdparty/libwebp/AUTHORS
index 0f382da..b6e9cfb 100644
--- a/src/3rdparty/libwebp/AUTHORS
+++ b/src/3rdparty/libwebp/AUTHORS
@@ -2,8 +2,10 @@ Contributors:
 - Charles Munger (clm at google dot com)
 - Christian Duvivier (cduvivier at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
+- Hui Su (huisu at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
+- Jehan (jehan at girinstud dot io)
 - Johann (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
@@ -16,6 +18,7 @@ Contributors:
 - Mislav Bradac (mislavm at google dot com)
 - Nico Weber (thakis at chromium dot org)
 - Noel Chromium (noel at chromium dot org)
+- Owen Rodley (orodley at google dot com)
 - Parag Salasakar (img dot mips1 at gmail dot com)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
diff --git a/src/3rdparty/libwebp/ChangeLog b/src/3rdparty/libwebp/ChangeLog
index 99fb3c0..7ac7b5f 100644
--- a/src/3rdparty/libwebp/ChangeLog
+++ b/src/3rdparty/libwebp/ChangeLog
@@ -1,3105 +1,3484 @@
-deb54d9 Clarify the expected 'config' lifespan in WebPIDecode()
-c7e2d24 update ChangeLog (tag: v0.5.1-rc5)
-c7eb06f Fix corner case in CostManagerInit.
-ab7937a gif2webp: normalize the number of .'s in the help message
-3cdec84 vwebp: normalize the number of .'s in the help message
-bdf6241 cwebp: normalize the number of .'s in the help message
-06a38c7 fix rescaling bug: alpha plane wasn't filled with 0xff
-319e37b Improve lossless compression.
-447adbc 'our bug tracker' -> 'the bug tracker'
-97b9e64 normalize the number of .'s in the help message
-bb50bf4 pngdec,ReadFunc: throw an error on invalid read
-38063af decode.h,WebPGetInfo: normalize function comment
-9e8e1b7 Inline GetResidual for speed.
-7d58d1b Speed-up uniform-region processing.
-23e29cb Merge "Fix a boundary case in BackwardReferencesHashChainDistanceOnly." into 0.5.1
-0bb23b2 free -> WebPSafeFree()
-e7b9177 Merge "DecodeImageData(): change the incorrect assert" into 0.5.1
-2abfa54 DecodeImageData(): change the incorrect assert
-5a48fcd Merge "configure: test for -Wfloat-conversion"
-0174d18 Fix a boundary case in BackwardReferencesHashChainDistanceOnly.
-6a9c262 Merge "Added MSA optimized transform functions"
-cfbcc5e Make sure to consider small distances in LZ77.
-5e60c42 Added MSA optimized transform functions
-3dc28d7 configure: test for -Wfloat-conversion
-f2a0946 add some asserts to delimit the perimeter of CostManager's operation
-9a583c6 fix invalid-write bug for alpha-decoding
-f66512d make gradlew executable
-6fda58f backward_references: quiet double->int warning
-a48cc9d Merge "Fix a compression regression for images with long uniform regions." into 0.5.1
-cc2720c Merge "Revert an LZ77 boundary constant." into 0.5.1
-059aab4 Fix a compression regression for images with long uniform regions.
-b0c7e49 Check more backward matches with higher quality.
-a361151 Revert an LZ77 boundary constant.
-8190374 README: fix typo
-7551db4 update NEWS
-0fb2269 bump version to 0.5.1
-f453761 update AUTHORS & .mailmap
-3259571 Refactor GetColorPalette method.
-1df5e26 avoid using tmp histogram in PreparePair()
-7685123 fix comment typos
-a246b92 Speedup backward references.
-76d73f1 Merge "CostManager: introduce a free-list of ~10 intervals"
-eab39d8 CostManager: introduce a free-list of ~10 intervals
-4c59aac Merge "mips msa webp configuration"
-043c33f Merge "Improve speed and compression in backward reference for lossless."
-71be9b8 Merge "clarify variable names in HistogramRemap()"
-0ba7fd7 Improve speed and compression in backward reference for lossless.
-0481d42 CostManager: cache one interval and re-use it when possible
-41b7e6b Merge "histogram: fix bin calculation"
-96c3d62 histogram: fix bin calculation
-fe9e31e clarify variable names in HistogramRemap()
-ce3c824 disable near-lossless quantization if palette is used
-e11da08 mips msa webp configuration
-5f8f998 mux: Presence of unknown chunks should trigger VP8X chunk output.
-cadec0b Merge "Sync mips32 and dsp_r2 YUV->RGB code with C verison"
-d963775 Compute the hash chain once and for all for lossless compression.
-50a4866 Sync mips32 and dsp_r2 YUV->RGB code with C verison
-eee788e Merge "introduce a common signature for all image reader function"
-d77b877 introduce a common signature for all image reader function
-ca8d951 remove some obsolete TODOs
-ae2a722 collect all decoding utilities from examples/ in libexampledec.a
-0b8ae85 Merge "Move DitherCombine8x8 to dsp/dec.c"
-77cad88 Merge "ReadWebP: avoid conversion to ARGB if final format is YUVA"
-ab8d669 ReadWebP: avoid conversion to ARGB if final format is YUVA
-f8b7ce9 Merge "test pointer to NULL explicitly"
-5df6f21 test pointer to NULL explicitly
-77f21c9 Move DitherCombine8x8 to dsp/dec.c
-c9e6d86 Add gradle support
-c65f41e Revert "Add gradle support"
-bf731ed Add gradle support
-08333b8 WebPAnimEncoder: Detect when canvas is modified, restore only when needed.
-0209d7e Merge "speed-up MapToPalette() with binary search"
-fdd29a3 speed-up MapToPalette() with binary search
-cf4a651 Revert "Refactor GetColorPalette method."
-0a27aca Merge changes Idfa8ce83,I19adc9c4
-f25c440 WebPAnimEncoder: Restore original canvas between multiple encodes.
-169004b Refactor GetColorPalette method.
-576362a VP8LDoFillBitWindow: support big-endian in fast path
-ac49e4e bit_reader.c: s/VP8L_USE_UNALIGNED_LOAD/VP8L_USE_FAST_LOAD/
-d39ceb5 VP8LDoFillBitWindow: remove stale TODO
-2ec2de1 Merge "Speed-up BackwardReferencesHashChainDistanceOnly."
-3e023c1 Speed-up BackwardReferencesHashChainDistanceOnly.
-f2e1efb Improve near lossless compression when a prediction filter is used.
-e15afbc dsp.h: fix ubsan macro name
-e53c9cc dsp.h: add WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-af81fdb utils.h: quiet -fsanitize=undefined warnings
-ea0be35 dsp.h: remove utils.h include
-cd276ae utils/*.c: ../utils/utils.h -> ./utils.h
-c892713 utils/Makefile.am: add some missing headers
-ea24e02 Merge "dsp.h: add WEBP_UBSAN_IGNORE_UNDEF"
-369e264 dsp.h: add WEBP_UBSAN_IGNORE_UNDEF
-0d020a7 Merge "add runtime NEON detection"
-5ee2136 Merge "add VP8LAddPixels() to lossless.h"
-47435a6 add VP8LAddPixels() to lossless.h
-8fa6ac6 remove two ubsan warnings
-74fb56f add runtime NEON detection
-4154a83 MIPS update to new Unfilter API
-c80b9fc Merge "cherry-pick decoder fix for 64-bit android devices"
-6235147 cherry-pick decoder fix for 64-bit android devices
-d41b8c4 configure: test for -Wformat-* w/-Wformat present
-5f95589 Fix WEBP_ALIGN in case the argument is a pointer to a type larger than a byte.
-2309fd5 replace num_parts_ by num_parts_minus_one_ (unsigned)
-9629f4b SimplifySegments: quiet -Warray-bounds warning
-de47492 Merge "update the Unfilter API in dsp to process one row independently"
-2102ccd update the Unfilter API in dsp to process one row independently
-e3912d5 WebPAnimEncoder: Restore canvas before evaluating blending possibility.
-6e12e1e WebPAnimEncoder: Fix for single-frame optimization.
-602f344 Merge changes I1d03acac,Ifcb64219
-95ecccf only apply color-mapping for alpha on the cropped area
-47dd070 anim_diff: Add an experimental option for max inter-frame diff.
-aa809cf only allocate alpha_plane_ up to crop_bottom row
-31f2b8d WebPAnimEncoder: FlattenSimilarPixels(): look for similar
-774dfbd perform alpha filtering within the decoding loop
-a4cae68 lossless decoding: only process decoded row up to last_row
-238cdcd Only call WebPDequantizeLevels() on cropped area
-cf6c713 alpha: preparatory cleanup
-b95ac0a Merge "VP8GetHeaders(): initialize VP8Io with sane value for crop/scale dimensions"
-8923139 VP8GetHeaders(): initialize VP8Io with sane value for crop/scale dimensions
-5828e19 use_8b_decode -> use_8b_decode_
-8dca024 fix bug in alpha.c that was triggering a memory error in incremental mode
-9a950c5 WebPAnimEncoder: Disable filtering when blending is used with lossy encoding.
-eb42390 WebPAnimEncoder: choose max diff for framerect based on quality.
-ff0a94b WebPAnimEncoder lossy: ignore small pixel differences for frame rectangles.
-f804008 gif2webp: Remove the 'prev_to_prev_canvas' buffer.
-6d8c07d Merge "WebPDequantizeLevels(): use stride in CountLevels()"
-d96fe5e WebPDequantizeLevels(): use stride in CountLevels()
-ec1b240 WebPPictureImport*: check output pointer
-c076876 Merge "Revert "Re-enable encoding of alpha plane with color cache for next release.""
-41f14bc WebPPictureImport*: check src pointer
-64eed38 Pass stride parameter to WebPDequantizeLevels()
-97934e2 Revert "Re-enable encoding of alpha plane with color cache for next release."
-e88c4ca fix -m 2 mode-cost evaluation (causing partition0 overflow)
-4562e83 Merge "add extra meaning to WebPDecBuffer::is_external_memory"
-abdb109 add extra meaning to WebPDecBuffer::is_external_memory
-875aec7 enc_neon,cosmetics: break long comment
-71e856c GetMBSSIM,cosmetics: fix alignment
-a90edff fix missing 'extern' for SSIM function in dsp/
-423ecaf move some SSIM-accumulation function for dsp/
-f08e662 Merge "Fix FindClosestDiscretized in near lossless:"
-0d40cc5 enc_neon,Disto4x4: remove an unnecessary transpose
-e8feb20 Fix FindClosestDiscretized in near lossless:
-8200643 anim_util: quiet static analysis warning
-a6f23c4 Merge "AnimEncoder: Support progress hook and user data."
-a519377 Merge "Near lossless feature: fix some comments."
-da98d31 AnimEncoder: Support progress hook and user data.
-3335713 Near lossless feature: fix some comments.
-0beed01 cosmetics: fix indent after 2f5e898
-6753f35 Merge "FTransformWHT optimization."
-6583bb1 Improve SSE4.1 implementation of TTransform.
-7561d0c FTransformWHT optimization.
-7ccdb73 fix indentation after patch #328220
-6ec0d2a clarify the logic of the error path when decoding fails.
-8aa352b Merge "Remove an unnecessary transposition in TTransform."
-db86088 Merge "remove useless #include"
-9960c31 Remove an unnecessary transposition in TTransform.
-6e36b51 Small speedup in FTransform.
-9dbd4aa Merge "fix C and SIMD flags completion."
-e60853e Add missing common_sse2.h file to makefile.unix
-696eb2b fix C and SIMD flags completion.
-2b4fe33 Merge "fix multiple allocation for transform buffer"
-2f5e898 fix multiple allocation for transform buffer
-bf2b4f1 Regroup common SSE code + optimization.
-4ed650a force "-pass 6" if -psnr or -size is used but -pass isn't.
-3ef1ce9 yuv_sse2: fix -Wconstant-conversion warning
-a7a03e9 Merge changes I4852d18f,I51ccb85d
-5e122bd gif2webp: set enc_options.verbose = 0 w/-quiet
-ab3c258 anim_encode,DefaultEncoderOptions: init verbose
-8f0dee7 Merge "configure: fix builtin detection w/-Werror"
-4a7b85a cmake: fix builtin detection w/-Werror
-b74657f configure: fix builtin detection w/-Werror
-3661b98 Add a CMakeLists.txt
-75f4af4 remove useless #include
-6c1d763 avoid Yoda style for comparison
-8ce975a SSE optimization for vector mismatch.
-7db5383 Merge tag 'v0.5.0'
-37f0494 update ChangeLog (tag: v0.5.0-rc1, tag: v0.5.0, origin/0.5.0, 0.5.0)
-7e7b6cc faster rgb565/rgb4444/argb output
-4c7f565 update NEWS
-1f62b6b update AUTHORS
-e224fdc update mailmap
-7110050 bump version to 0.5.0
-230a685 README: update help text, repo link
-d48e427 Merge "demux: accept raw bitstreams"
-99a01f4 Merge "Unify some entropy functions."
-4b025f1 Merge "configure: disable asserts by default"
-92cbddf Merge "fix PrintBlockInfo()"
-ca509a3 Unify some entropy functions.
-367bf90 fix PrintBlockInfo()
-b0547ff move back common constants for lossless_enc*.c into the .h
-fb4c783 lossless: simpler alpha cleanup preprocessing
-ba7f4b6 Merge "anim_diff: add brief description of options"
-47ddd5a Move some codec logic out of ./dsp .
-b4106c4 anim_diff: add brief description of options
-357f455 yuv_sse2: fix 32-bit visual studio build
-b9d80fa configure: disable asserts by default
-7badd3d cosmetic fix: sizeof(type) -> sizeof(*var)
-80ce27d Speed up 24-bit packing / unpacking in YUV / RGB conversions.
-68eebcb remove a TODO about rotation
-2dee296 remove few obsolete TODO about aligned loads in SSE2
-e0c0bb3 remove TODO about unused ref_lf_delta[]
-9cf1cc2 remove few TODO:   * 256 -> RD_DISTO_MULT   * don't use TDisto for UV mode picking
-7918964 Merge changes from topic 'demux-fragment-cleanup'
-47399f9 demux: remove GetFragment()
-d3cfb79 demux: remove dead fragment related TODO
-ab714b8 demux, Frame: remove is_fragment_ field
-b105921 yuv_sse2, cosmetics: fix indent
-466c92e demux,WebPIterator: remove fragment_num/num_fragments
-11714ff demux: remove WebPDemuxSelectFragment
-c0f7cc4 fix for bug #280: UMR in next->bits
-578beeb Merge "enc/Makefile.am: add missing headers"
-1a819f0 makefile.unix: make visibility=hidden the default
-d4f9c2e enc/Makefile.am: add missing headers
-846caff configure: check for -fvisibility=hidden
-3f3ea2c demux: accept raw bitstreams
-d6dad5d man cwebp: add precision about exactness of the 'lossless' mode
-46bb1e3 Merge "gifdec: remove utils.h include"
-2b882e9 Merge "Makefile.vc: define WEBP_HAVE_GIF for gifdec.c"
-892b923 Merge "man/*, AUTHORS: clarify origin of the tool"
-e5687a1 Merge "fix optimized build with -mcmodel=medium"
-e56e685 Makefile.vc: define WEBP_HAVE_GIF for gifdec.c
-4077d94 gifdec: remove utils.h include
-b5e30da man/*, AUTHORS: clarify origin of the tool
-b275e59 fix optimized build with -mcmodel=medium
-64da45a cosmetics, cwebp: fix indent
-038a060 Merge "add disto-based refinement for UV mode (if method = 1 or 2)"
-2835089 Provide an SSE2 implementation of CombinedShannonEntropy.
-e6c9351 add disto-based refinement for UV mode (if method = 1 or 2)
-04507dc Merge "fix undefined behaviour during shift, using a cast"
-793c526 Merge "wicdec: add support for reading from stdin"
-d3d1639 Optimize the heap usage in HistogramCombineGreedy.
-202a710 fix undefined behaviour during shift, using a cast
-14d27a4 improve method #2 by merging DistoRefine() and  SimpleQuantize()
-cb1ce99 Merge "10% faster table-less SSE2/NEON version of YUV->RGB conversion"
-ac761a3 10% faster table-less SSE2/NEON version of YUV->RGB conversion
-79fcf29 wicdec: add support for reading from stdin
-015f173 Merge "cwebp: add support for stdin input"
-a9947c3 cwebp: add support for stdin input
-7eb01ff Merge "Improved alpha cleanup for the webp encoder when prediction transform is used."
-fb8c910 Merge "introduce WebPMemToUint32 and WebPUint32ToMem for memory access"
-bd91af2 Merge "bit_reader: remove aarch64 BITS TODO"
-6c702b8 Speed up hash chain initialization using memset.
-4c60f63 make ReadPNG and ReadJPEG take a filename instead of a FILE
-464ed10 bit_reader: remove aarch64 BITS TODO
-d478e58 Merge "configure: update issue tracker"
-6938111 Improved alpha cleanup for the webp encoder when prediction transform is used.
-2c08aac introduce WebPMemToUint32 and WebPUint32ToMem for memory access
-010ca3d Fix FindMatchLength with non-aligned buffers.
-a90e1e3 README: add prerequisites for an autoconf build
-458f086 configure: update issue tracker
-3391459 vwebp: work around the transparent background with GLUT bug
-e4a7eed cosmetics: fix indent
-0837512 Merge "Make a separate case for low_effort in CopyImageWithPrediction"
-aa2eb2d Merge "cosmetics: fix indent"
-b7551e9 cosmetics: fix indent
-5bda52d Make a separate case for low_effort in CopyImageWithPrediction
-66fa598 Merge "configure: fix intrinsics build w/older gcc"
-5ae220b backward_references.c: Fixed compiler warning
-1556da0 Merge "configure: restore 2 warnings"
-71a17e5 configure: restore 2 warnings
-9eeabc0 configure: fix intrinsics build w/older gcc
-363babe Merge "fix some warning about unaligned 32b reads"
-a141178 Optimization in hash chain comparison for 64 bit Arrays were compared 32 bits at a time, it is now done 64 bits at a time. Overall encoding speed-up is only of 0.2% on @skal's small PNG corpus. It is of 3% on my initial 1.3 Mp desktop screenshot image.
-829bd14 Combine Huffman cost and bit entropy into one loop
-a7a954c Merge "lossless: make prediction in encoder work per scanline"
-61b605b Merge "fix of undefined multiply (int32 overflow)"
-239421c lossless: make prediction in encoder work per scanline
-f5ca40e fix of undefined multiply (int32 overflow)
-5cd2ef4 Merge changes from topic 'win-threading-compat'
-76ce918 Makefile.vc: enable WEBP_USE_THREAD for windows phone
-d2afe97 thread: use CreateThread for windows phone
-0fd0e12 thread: use WaitForSingleObjectEx if available
-63fadc9 thread: use InitializeCriticalSectionEx if available
-110ad58 thread: use native windows cond var if available
-912c9fd dec/webp: use GetLE(24|32) from utils
-f169448 utils/GetLE32: correct uint32 promotion
-158763d Merge "always call WebPInitSamplers(), don't try to be smart"
-3770f3b Merge "cleanup the YFIX/TFIX difference by removing some code and #define"
-a40f60a Merge "3% speed improvement for lossless webp encoder for low effort mode:"
-ed1c2bc always call WebPInitSamplers(), don't try to be smart
-b8c44f1 3% speed improvement for lossless webp encoder for low effort mode:
-997e103 cleanup the YFIX/TFIX difference by removing some code and #define
-d73d1c8 Merge "Make discarding invisible RGB values (cleanup alpha) the default."
-1f9be97 Make discarding invisible RGB values (cleanup alpha) the default.
-f240117 Make dwebp listen more to the -quiet flag
-b37b017 fix for issue #275: don't compare to out-of-bound pointers
-21735e0 speed-up trivial one-symbol decoding case for lossless
-397863b Refactor CopyPlane() and CopyPixels() methods: put them in utils.
-6ecd72f Re-enable encoding of alpha plane with color cache for next release.
-1f7148a Merge "remove unused fields from WebPDecoderOptions and WebPBitstreamFeatures"
-6ae395f Merge "use ExReadFile() for ReadYUV()"
-8076a00 gitignore list: add anim_diff.
-1c1702d use ExReadFile() for ReadYUV()
-775d3a3 remove unused fields from WebPDecoderOptions and WebPBitstreamFeatures
-c13245c AnimEncoder: Add a GetError() method.
-688b265 AnimDecoder API: Add a GetDemuxer() method.
-1aa4e3d WebPAnimDecoder: add an option to enable multi-threaded decoding.
-3584abc AnimDecoder: option to decode to common color modes.
-afd5a62 Merge "mux.h does NOT need to include encode.h"
-8550d44 Merge "migrate anim_diff tool from C++ to C89"
-96201e5 migrate anim_diff tool from C++ to C89
-945cfa3 mux.h does NOT need to include encode.h
-8da07e8 Merge "~2x faster SSE2 RGB24toY, BGR24toY, ARGBToY|UV"
-bfd3fc0 ~2x faster SSE2 RGB24toY, BGR24toY, ARGBToY|UV
-0243242 man/cwebp.1, cosmetics: escape '-'s
-96f5b42 man/cwebp: group lossy-only options
-52fdbdf extract some RGB24 to Luma conversion function from enc/ to dsp/
-ab8c230 add missing \n
-8304179 sync NEWS with 0.4.4
-5bd04a0 sync versions with 0.4.4
-8f1fcc1 Merge "Move ARGB->YUV functions from dec/vp8l.c to dsp/yuv.c"
-25bf2ce fix some warning about unaligned 32b reads
-922268f s/TIFF/WebP
-fa8927e Move ARGB->YUV functions from dec/vp8l.c to dsp/yuv.c
-9b37359 Merge "for ReadXXXX() image-readers, use the value of pic->use_argb"
-f7c507a Merge "remove unnecessary #include "yuv.h""
-7861578 for ReadXXXX() image-readers, use the value of pic->use_argb
-14e4043 remove unnecessary #include "yuv.h"
-469ba2c vwebp: fix incorrect clipping w/NO_BLEND
-4b9186b update issue tracker url
-d64d376 change WEBP_ALIGN_CST value to 31
-f717b82 vp8l.c, cosmetics: fix indent after 95509f9
-927ccdc Merge "fix alignment of allocated memory in AllocateTransformBuffer"
-fea94b2 fix alignment of allocated memory in AllocateTransformBuffer
-5aa8d61 Merge "MIPS: rescaler code synced with C implementation"
-e7fb267 MIPS: rescaler code synced with C implementation
-93c86ed Merge "format_constants.h: MKFOURCC, correct cast"
-5d791d2 format_constants.h: MKFOURCC, correct cast
-65726cd dsp/lossless: Average2, make a constant unsigned
-d26d9de Use __has_builtin to check clang support
-12ec204 moved ALIGN_CST into util/utils.h and renamed WEBP_ALIGN_xxx
-a264083 Merge "rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand"
-3fb600d Merge "wicdec: fix alpha detection w/64bpp BGRA/RGBA"
-67c547f rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand
-99e3f81 Merge "large re-organization of the delta-palettization code"
-95509f9 large re-organization of the delta-palettization code
-74fb458 fix for weird msvc warning message
-ae49ad8 Merge "SSE2 implementation of ImportRowShrink"
-932fd4d SSE2 implementation of ImportRowShrink
-badfcba wicdec: fix alpha detection w/64bpp BGRA/RGBA
-35cafa6 Merge "iosbuild: fix linking with Xcode 7 / iOS SDK 9"
-b0c9d8a label rename: NO_CHANGE -> NoChange
-b4e731c neon-implementation for rescaler code
-db1321a iosbuild: fix linking with Xcode 7 / iOS SDK 9
-6dfa5e3 rescaler: better handling of the fxy_scale=0 special case.
-55c0529 Revert "rescaler: better handling of the fxy_scale=0 special case."
-9f226bf rescaler: better handling of the fxy_scale=0 special case.
-f7b8f90 delta_palettization.*: add copyright
-c1e1b71 Changed delta palette to compress better
-0dd2826 Merge "Add delta_palettization feature to WebP"
-48f66b6 Add delta_palettization feature to WebP
-27933e2 anim_encoder: drop a frame if it has same pixels as the prev frame.
-df9f6ec Merge "webpmux/DisplayInfo: send non-error output to stdout"
-8af4993 Merge "rescaler_mips_dsp_r2: cosmetics, fix indent"
-2b9d249 Merge "rescaler: cosmetics, join two lines"
-cc020a8 webpmux/DisplayInfo: send non-error output to stdout
-a288e74 configure: add -Wshorten-64-to-32
-c4c3cf2 pngdec: fix type conversion warnings
-bef8e97 webpmux: fix type conversion warning
-5a84460 rescaler_mips_dsp_r2: cosmetics, fix indent
-acde0aa rescaler: cosmetics, join two lines
-306ce4f rescaler: move the 1x1 or 2x1 handling one level up
-cced974 remove _mm_set_epi64x(), which is too specific
-56668c9 fix warnings about uint64_t -> uint32_t conversion
-76a7dc3 rescaler: add some SSE2 code
-1df1d0e rescaler: harmonize function protos
-9ba1894 rescaler: simplify ImportRow logic
-5ff0079 fix rescaler vertical interpolation
-cd82440 VP8LAllocateHistogramSet: align histogram[] entries
-a406b1d Merge "fix memory over-allocation in lossless rescaler init"
-0fde33e add missing const in VP8InitFrame signature
-ac7d5e8 fix memory over-allocation in lossless rescaler init
-017f8cc Loosen the buffer size checks for Y/U/V/A too.
-15ca501 loosen the padding check on buffer size
-d623a87 dec_neon: add whitespace around stringizing operator
-29377d5 dsp/mips: cosmetics: add whitespace around XSTR macro
-eebaf97 dsp/mips: add whitespace around stringizing operator
-d39dc8f Create a WebPAnimDecoder API.
-03fb752 gif2webp: print output file size
-14efabb Android: limit use of cpufeatures
-7b83adb preparatory cosmetics for Rescaler code fix and clean-up
-77fb41c dec/vp8l/DecodeAlphaData: remove redundant cast
-90fcfcd Insert less hash chain entries from the beginnings of long copies.
-bd55604 SSE2: add yuv444 converters, re-using yuv_sse2.c
-41a5d99 add a -quiet option to 'dwebp'
-80ab3ed Merge "README: update dwebp help output after 1e595fe"
-32b71b2 README: update dwebp help output after 1e595fe
-3ec1182 use the DispatchAlpha() call from dsp
-c5f0062 incorporate bzero() into WebPRescalerInit() instead of call site
-3ebcdd4 remove duplicate "#include <stdlib.h>"
-1e595fe dwebp: add -resize as a synonym for -scale
-24a9693 dec: allow 0 as a scaling dimension
-b918724 utils/rescaler: add WebPRescalerGetScaledDimensions
-923e8ed Merge "update NEWS"
-020fd09 Merge "WebPPictureDistortion: support ARGB format for 'pic' when computing distortion."
-6a5292f update NEWS
-56a2e9f WebPPictureDistortion: support ARGB format for 'pic' when computing distortion.
-0ae582e configure: test and add -Wunreachable-code
-c2f9dc0 bit_writer: convert VP8L macro values to immediates
-b969f88 Reduce magic in palette reordering
-acb297e anim_diff: add a -raw_comparison flag
-155c1b2 Merge changes I76f4d6fe,I45434639
-717e4d5 mips32/mipsDSPr2: function ImportRow rebased
-7df9389 fix rescaling bug (uninitialized read, see bug #254).
-5cdcd56 lossless_enc_neon: add VP8LTransformColor
-a53c336 lossless_neon: add VP8LTransformColorInverse
-99131e7 Merge changes I9fb25a89,Ibc648e9e
-c455676 simplify the main loop for downscaling
-2a010f9 lossless_neon: remove predictors 5-13
-ca221bb ll_enc_neon: enable VP8LSubtractGreenFromBlueAndRed
-585d93d Container spec: clarify ordering of ALPH chunk.
-01d61fd lossless: ~20 % speedup
-f722c8f lossless: Speed up ComputeCacheEntropy by 40 %
-1ceecdc add a VP8LColorCacheSet() method for color cache
-17eb609 lossless: Allow copying from prev row in rle-mode.
-f3a7a5b lossless: bit writer optimization
-d97b9ff Merge changes from topic 'lossless-enc-improvements'
-0250dfc msvc: fix pointer type warning in BitsLog2Floor
-52931fd lossless: combine the Huffman code with extra bits
-c4855ca lossless: Inlining add literal
-8e9c94d lossless: simplify HashChainFindCopy heuristics
-888429f lossless: 0.5 % compression density improvement
-7b23b19 lossless: Add zeroes into the predicted histograms.
-85b44d8 lossless: encoding, don't compute unnecessary histo
-d92453f lossless: Remove about 25 % of the speed degradation
-2cce031 Faster alpha coding for webp
-5e75642 lossless: rle mode not to accept lengths smaller than 4.
-84326e4 lossless: Less code for the entropy selection
-16ab951 lossless: 0.37 % compression density improvement
-822f113 add WebPFree() to the API
-0ae2c2e SSE2/SSE41: optimize SSE_16xN loops
-39216e5 cosmetics: fix indent after 32462a07
-559e54c Merge "SSE2: slightly faster FTransformWHT"
-8ef9a63 SSE2: slightly faster FTransformWHT
-f27f773 lossless_neon: enable VP8LAddGreenToBlueAndRed
-36e9c4b SSE2: minor cosmetrics on in-loop filter code
-4741fac dsp/lossless_*sse2: remove some unnecessary inlines
-1819965 fix warning ("left shift of negative value") using a cast
-7017001 SSE2: speed-up some lossless-encoding functions
-abcb012 Merge "SSE2: slightly faster (~5%) AddGreenToBlueAndRed()"
-2df5bd3 Merge "Speedup to HuffmanCostCombinedCount"
-9e356d6 SSE2: slightly faster (~5%) AddGreenToBlueAndRed()
-fc6c75a SSE2: 53% faster TransformColor[Inverse]
-49073da SSE2: 46% speed-up of TransformColor[Inverse]
-32462a0 Speedup to HuffmanCostCombinedCount
-f3d687e SSE4.1 implementation of some lossless encoding functions
-bfc300c SSE4.1 implementation of some alpha-processing functions
-7f9c98f Merge "sse2 in-loop: simplify SignedShift8b() a bit"
-ef314a5 dec_sse2/GetNotHEV: micro optimization
-a729cff sse2 in-loop: simplify SignedShift8b() a bit
-422ec9f simplify Load8x4() a bit
-8df238e Merge "remove some duplicate FlipSign()"
-751506c remove some duplicate FlipSign()
-65ef5af Merge "lossless: 0.13% compression density gain"
-2beef2f lossless: 0.13% compression density gain
-3033f24 lossless: 0.06 % compression density improvement
-64960da dec_neon: add VE8uv / VE16
-14dbd87 dec_neon: add HE8uv / HE16
-ac76801 introduce FTransform2 to perform two transforms at a time.
-aa6065a dec_neon: use vld1_dup(mem) rather than vdup(mem[0])
-8b63ac7 Merge "dec_neon: add TM16"
-f51be09 Merge "dec_neon/TrueMotion: simply left border load"
-dc48196 dec_neon: add TM16
-ea95b30 dec_neon/TrueMotion: simply left border load
-f262d61 speed-up SetResidualSSE2
-bf46d0a fix mips2 build target
-929a0fd enc_sse2/TTransform: simplify abs calculation
-17dbd05 enc_sse2/CollectHistogram: simplify abs calculation
-a6c1593 dec_neon: add DC16 intra predictors
-03b4f50 Makefile.vc: add anim_diff build support.
-1b98987 Merge changes I9cd84125,Iee7e387f,I7548be72
-acd7b5a Introduce a test tool anim_diff.
-f274a96 dsp/enc_sse2: add luma4 intra predictors
-040b11b dsp/enc_sse2: add chroma intra predictors
-aee021b dsp/enc_sse2: add luma16 intra predictors
-9e00a49 makefile.unix: remove superclean target
-cefc9c0 makefile.unix: clean up after extras target
-4c9af02 dec_neon: add DC8uvNoTopLeft
-dd55b87 Merge "doc/webp-container-spec: update repo browser link"
-f048696 doc/webp-container-spec: update repo browser link
-9287761 Merge "GetResidualCostSSE2: simplify abs calculation"
-0e00936 dsp/cpu.c(x86): check maximum supported cpuid feature
-b243a4b GetResidualCostSSE2: simplify abs calculation
-6d4602b Merge "fix typo: constitutes -> constitute"
-5fe1fe3 fix typo: constitutes -> constitute
-b83bd7c Merge "populate 'libwebpextras' with: import gray, rgb565 and rgb4444 functions"
-b0114a3 Merge "histogram.h: cosmetics: remove unnecessary includes"
-feab45e gifdec: Move inclusion of webp/config.h to header.
-dbba67d histogram.h: cosmetics: remove unnecessary includes
-e978fec Merge "VP8LBitReader: fix remaining ubsan error with large shifts"
-d6fe588 Merge "ReconstructRow: move some one-time inits out of the main loop"
-a21d647 ReconstructRow: move some one-time inits out of the main loop
-7a01c3c VP8LBitReader: fix remaining ubsan error with large shifts
-7fa67c9 change GetPixPairHash64() return type to uint32_t
-ec1fb9f Merge "dsp/enc.c: cosmetics: move DST() def closer to use"
-7073bfb Merge "split 64-mult hashing into two 32-bit multiplies"
-0768b25 dsp/enc.c: cosmetics: move DST() def closer to use
-6a48b8f Merge "fix MSVC size_t->int conversion warning"
-1db07cd Merge "anim_encode: cosmetics: fix alignment"
-e28271a anim_encode: cosmetics: fix alignment
-7fe357b split 64-mult hashing into two 32-bit multiplies
-af74c14 populate 'libwebpextras' with: import gray, rgb565 and rgb4444 functions
-6121413 remove VP8Residual::cost unused field
-e254482 fix MSVC size_t->int conversion warning
-b69a6c3 vwebp: don't redefine snprintf with VS2015+
-0ac29c5 AnimEncoder API: Consistent use of trailing underscores in struct.
-d484555 AnimEncoder API: Use timestamp instead of duration as input to Add().
-9904e36 dsp/dec_sse2: DC8uv / DC8uvNoLeft speedup
-7df2049 dsp/dec_sse2: DC16 / DC16NoLeft speedup
-8e515df Merge "makefile.unix: add some missing headers"
-db12250 cosmetics: vp8enci.h: break long line
-bf516a8 makefile.unix: add some missing headers
-b44eda3 dsp: add DSP_INIT_STUB
-03e76e9 clarify the comment about double-setting the status in SetError()
-9fecdd7 remove unused EmitRGB()
-43f010d move ReconstructRow to top
-82d9802 add a dec/common.h header to collect common enc/dec #defines
-5d4744a Merge "enc_sse41: add Disto4x4 / Disto16x16"
-e38886a mux.h: Bump up ABI version
-46305ca configure: add --disable-<avx2|sse4.1|sse2>
-2fc8b65 CPPFLAGS->CFLAGS for detecting sse4.1 in preprocessor
-1a338fb enc_sse41: add Disto4x4 / Disto16x16
-9405550 encoding SSE4.1 stub for StoreHistogram + Quantize + SSE_16xN
-c64659e remove duplicate variables after the lossless{_enc}.c split
-67ba7c7 enc_sse2: call local FTransform in CollectHistogram
-1824979 dsp: s/VP8LSetHistogramData/VP8SetHistogramData/
-ede5e15 cosmetics: dsp/lossless.h: reorder prototypes
-553051f dsp/lossless: split enc/dec functions
-9064adc Merge "conditionally add -msse4.1 in Makefile.unix"
-cecf509 dsp/yuv*.c: rework WEBP_USE_<arch> ifdef
-6584d39 dsp/upsampling*.c: rework WEBP_USE_<arch> ifdef
-8080942 dsp/rescaler*.c: rework WEBP_USE_<arch> ifdef
-1d93dde dsp/lossless*.c: rework WEBP_USE_<arch> ifdef
-73805ff dsp/filters*.c: rework WEBP_USE_<arch> ifdef
-fbdcef2 dsp/enc*.c: rework WEBP_USE_<arch> ifdef
-66de69c dsp/dec*.c: rework WEBP_USE_<arch> ifdef
-48e4ffd dsp/cost*.c: rework WEBP_USE_<arch> ifdef
-29fd6f9 dsp/argb*.c: rework WEBP_USE_<arch> ifdef
-80ff381 dsp/alpha*.c: rework WEBP_USE_<arch> ifdef
-bf09cf1 conditionally add -msse4.1 in Makefile.unix
-e9570dd stub for SSE4.1 support.
-4a95384 Merge "dsp: add sse4.1 detection"
-cabf4bd dsp: add sse4.1 detection
-4ecba1a thread.h: rename interface param
-b8d706c Merge "sync versions with 0.4.3"
-ae64a71 Merge "add shell for libwebpextras"
-92a5da9 sync versions with 0.4.3
-9d4e2d1 Merge "~30% faster smart-yuv (-pre 4) with early-out criterion"
-b1bdbba ~30% faster smart-yuv (-pre 4) with early-out criterion
-7efb974 Merge "Disable NEON code on Native Client"
-ac4f578 Disable NEON code on Native Client
-0873f85 AnimEncoder API: Support input frames in YUV(A) format.
-5c176d2 add shell for libwebpextras
-44bd956 fix signature for VP8RecordCoeffTokens()
-c9b8ea0 small cosmetics on TokenBuffer.
-76394c0 Merge "MIPS: dspr2: added optimization for TrueMotion"
-0f77369 WebPPictureRescale: add a note about 0 width/height
-241bb5d MIPS: dspr2: added optimization for TrueMotion
-6cef0e4 examples/Android.mk: add webpmux_example target
-53c16ff Android.mk: add webpmux target
-21852a0 Android.mk: add webpdemux target
-8697a3b Android.mk: add webpdecoder{,_static} targets
-4a67049 Android.mk: split source lists per-directory
-b5e7942 MIPS: dspr2: Added optimization for some convert functions
-0f595db MIPS: dspr2: Added optimization for some convert functions
-8a218b4 MIPS: [mips32|dspr2]: GetResidualCost rebased
-ef98750 Speedup method StoreImageToBitMask by 5%.
-602a00f fix iOS arm64 build with Xcode 6.3
-2382050 1-2% faster encoding by removing an indirection in GetResidualCost()
-eddb7e7 MIPS: dspr2: added otpimization for DC8uv, DC8uvNoTop and DC8uvNoLeft
-73ba291 MIPS: dspr2: added optimization for functions RD4 and LD4
-c7129da Merge "4-5% faster encoding using SSE2 for GetResidualCost"
-94380d0 MIPS: dspr2: added optimizaton for functions VE4 and DC4
-2a40709 4-5% faster encoding using SSE2 for GetResidualCost
-17e1986 Merge "MIPS: dspr2: added optimization for simple filtering functions"
-3ec404c Merge "dsp: normalize WEBP_TSAN_IGNORE_FUNCTION usage"
-b969f5d dsp: normalize WEBP_TSAN_IGNORE_FUNCTION usage
-d7b8e71 MIPS: dspr2: added optimization for simple filtering functions
-235f774 Merge "MIPS: dspr2: Added optimization for function VP8LTransformColorInverse_C"
-42a8a62 MIPS: dspr2: Added optimization for function VP8LTransformColorInverse_C
-b442bef Merge "ApplyFiltersAndEncode: only copy lossless stats"
-b510fbf doc/webp-container-spec: note MSB order for chunk diagrams
-9bc0f92 ApplyFiltersAndEncode: only copy lossless stats
-3030f11 Merge "dsp/mips: add some missing TSan annotations"
-dfcf459 Merge "MIPS: dspr2: Added optimization for function VP8LAddGreenToBlueAndRed_C"
-55c75a2 dsp/mips: add some missing TSan annotations
-2cb879f MIPS: dspr2: Added optimization for function VP8LAddGreenToBlueAndRed_C
-e155601 move some cost tables from enc/ to dsp/
-c3a0316 Merge "picture_csp: fix build w/USE_GAMMA_COMPRESSION undefined"
-39537d7 Merge "VP8LDspInitMIPSdspR2: add missing TSan annotation"
-1dd419c picture_csp: fix build w/USE_GAMMA_COMPRESSION undefined
-43fd354 VP8LDspInitMIPSdspR2: add missing TSan annotation
-c7233df Merge "VP8LDspInit: remove memcpy"
-0ec4da9 picture_csp::InitGammaTables*: add missing TSan annotations
-35579a4 VP8LDspInit: remove memcpy
-97f6aff VP8YUVInit: add missing TSan annotation
-f9016d6 dsp/enc::InitTables: add missing TSan annotation
-e3d9771 VP8EncDspCostInit*: add missing TSan annotations
-d97c143 Merge "doc/webp-container-spec: cosmetics"
-309b790 MIPS: mips32: Added optimization for function SetResidualCoeffs
-a987fae MIPS: dspr2: added optimization for function GetResidualCost
-e7d3df2 doc/webp-container-spec: cosmetics
-be6635e Merge "VP8TBufferClear: remove some misleading const's"
-02971e7 Merge "VP8EmitTokens: remove unnecessary param void cast"
-3b77e5a VP8TBufferClear: remove some misleading const's
-aa139c8 VP8EmitTokens: remove unnecessary param void cast
-c24d8f1 cosmetics: upsampling_sse2: add const to some casts
-1829c42 cosmetics: lossless_sse2: add const to some casts
-183168f cosmetics: enc_sse2: add const to some casts
-860badc cosmetics: dec_sse2: add const to some casts
-0254db9 cosmetics: argb_sse2: add const to some casts
-1aadf85 cosmetics: alpha_processing_sse2: add const to some casts
-1579de3 vwebp: clear canvas at the beginning of each loop
-4b9fa5d Merge "webp-container-spec: clarify background clear on loop"
-4c82284 Updated the near-lossless level mapping.
-5603947 webp-container-spec: clarify background clear on loop
-19f0ba0 Implement true-motion prediction in SSE2
-774d4cb make VP8PredLuma16[] array non-const
-d7eabb8 Merge "MIPS: dspr2: Added optimization for function CollectHistogram"
-fe42739 Use integers for kmin/kmax for simplicity.
-b9df35f AnimEncode API: kmax=0 should imply all keyframes.
-6ce296d MIPS: dspr2: Added optimization for function CollectHistogram
-2c906c4 vwebp: remove unnecessary static Help() prototype
-be0fd1d Merge "dec/vp8: clear 'dither_' on skipped blocks"
-e96170f Merge "vwebp/animation: display last frame on end-of-loop"
-0f017b5 vwebp/animation: display last frame on end-of-loop
-c86b40c enc/near_lossless.c: fix alignment
-66935fb dec/vp8: clear 'dither_' on skipped blocks
-b7de794 Merge "lossless_neon: enable subtract green for aarch64"
-77724f7 SSE2 version of GradientUnfilter
-416e1ce lossless_neon: enable subtract green for aarch64
-72831f6 Speedup AnalyzeAndInit for low effort compression.
-a659748 Speedup Analyze methods for lossless compression.
-98c8138 Enable Near-lossless feature.
-c6b2454 AnimEncoder API: Fix for kmax=1 and default kmin case.
-022d2f8 add SSE2 variants for alpha filtering functions
-2db15a9 Temporarily disable encoding of alpha plane with color cache.
-1d575cc Merge "Lossless decoding: Remove an unnecessary if condition."
-cafa1d8 Merge "Simplify backward refs calculation for low-effort."
-7afdaf8 Alpha coding: reorganize the filter/unfiltering code
-4d6d728 Simplify backward refs calculation for low-effort.
-ec0d1be Cleaup Near-lossless code.
-9814ddb Remove the post-transform near-lossless heuristic.
-4509e32 Lossless decoding: Remove an unnecessary if condition.
-f2ebc4a Merge "Regression fix for lossless decoding"
-783a8cd Regression fix for lossless decoding
-9a062b8 AnimEncoder: Bugfix for kmin = 1 and kmax = 2.
-0f027a7 simplify smart RGB->YUV conversion code
-0d5b334 BackwardReferencesHashChainFollowChosenPath: remove unused variable
-f480d1a Fix to near lossless artefacts on palettized images.
-d4615d0 Merge changes Ia1686828,I399fda40
-cb4a18a rename HashChainInit into HashChainReset
-f079e48 use uint16_t for chosen_path[]
-da09121 MIPS: dspr2: Added optimization for function FTransformWHT
-b8c2013 Merge "wicdec: (msvs) quiet some /analyze warnings"
-9b228b5 wicdec: (msvs) quiet some /analyze warnings
-daeb276 Merge "MIPS: dspr2: Added optimization for MultARGBRow function"
-cc08742 Merge "dsp/cpu: (msvs) add include for __cpuidex"
-4a82aab Merge changes I87544e92,I0bb6cda5
-7a19139 dwebp/WritePNG: mark png variables volatile
-775dfad dwebp: include setjmp.h w/WEBP_HAVE_PNG
-47d26be dwebp: correct sign in format strings
-f0e0677 VP8LEncodeStream: add an assert
-c5f7747 VP8LColorCacheCopy: promote an int before shifting
-0de5f33 dsp/cpu: (msvs) add include for __cpuidex
-7d850f7 MIPS: dspr2: Added optimization for MultARGBRow function
-5487529 MIPS: dspr2: added optimization for function QuantizeBlock
-4fbe9cf dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
-3fd5903 simplify/reorganize arguments for CollectColorBlueTransforms
-b9e356b Disable costly TraceBackwards for method=0.
-a7e7caa MIPS: dspr2: added optimization for function TransformColorRed
-2cb3918 Merge "MIPS: dspr2: added optimization for function TransformColorBlue"
-279e661 Merge "dsp/cpu: add include for _xgetbv() w/MSVS"
-b6c0428 dsp/cpu: add include for _xgetbv() w/MSVS
-d1c4ffa gif2webp: Move GIF decoding related code to a support library.
-07c3955 Merge "AnimEncoder API: Add info in README.mux"
-7b16197 MIPS: dspr2: added optimization for function TransformColorBlue
-d7c4b02 cpu: fix AVX2 detection for gcc/clang targets
-9d29946 AnimEncoder API: Add info in README.mux
-d581ba4 follow-up: clean up WebPRescalerXXX dsp function
-f8740f0 dsp: s/USE_INTRINSICS/WEBP_USE_INTRINSICS/
-ce73abe Merge "introduce a separate WebPRescalerDspInit to initialize pointers"
-ab66bec introduce a separate WebPRescalerDspInit to initialize pointers
-205c7f2 fix handling of zero-sized partition #0 corner case
-cbcdd5f Merge "move rescaler functions to rescaler* files in src/dsp/"
-bf586e8 Merge changes I230b3532,Idf3057a7
-6dc79dc Merge "anim_encode: fix type conversion warnings"
-11fce25 Merge "dec_neon: remove returns from void functions"
-c4e63f9 Makefile.vc: add gif2webp target
-4f43d38 enable NEON for Windows ARM builds
-3f6615a Makefile.vc: add rudimentary Windows ARM support
-e7c5954 dec_neon: remove returns from void functions
-f79c163 anim_encode: fix type conversion warnings
-0f54f1e Remove gif2webp_util which is no longer needed.
-cbcbedd move rescaler functions to rescaler* files in src/dsp/
-ac79ed1 webpmux: remove experimental fragment handling
-e8694d4 mux: remove experimental FRGM parsing
-9e92b6e AnimEncoder API: Optimize single-frame animated images
-abbae27 Merge "Move over gif2webp to the new AnimEncoder API."
-a28c4b3 MIPS: move WORK_AROUND_GCC define to appropriate place
-012d2c6 MIPS: dspr2: added optimization for functions SSEAxB
-67720c8 Move over gif2webp to the new AnimEncoder API.
-9241ecf MIPS: dspr2: added optimization for function Average
-9422211 Merge "Tune BackwardReferencesLz77 for low_effort (m=0)."
-df40057 Merge "Speedup VP8LGetHistoImageSymbols for low effort (m=0) mode."
-ea08466 Tune BackwardReferencesLz77 for low_effort (m=0).
-b0b973c Speedup VP8LGetHistoImageSymbols for low effort (m=0) mode.
-c6d3292 argb_sse2: cosmetics
-67f601c make the 'last_cpuinfo_used' variable names unique
-b948986 AnimEncoder API: Init method for default options.
-856f8ec Merge "AnimEncoder API: Remove AnimEncoderFrameOptions."
-c537514 Merge "AnimEncoder API: GenerateCandidates bugfix."
-dc0ce03 Merge "AnimEncoder API: Compute change rectangle for first frame too."
-f00b639 Merge "AnimEncoder API: In Assemble(), always set animation parameters."
-29ed796 Merge "AnimEncoder lib cleanup: prev to prev canvas not needed."
-9f0dd6e Merge "WebPAnimEncoder API: Header and implementation"
-5e56bbe AnimEncoder API: Remove AnimEncoderFrameOptions.
-b902c3e AnimEncoder API: GenerateCandidates bugfix.
-ef3c39b AnimEncoder API: Compute change rectangle for first frame too.
-eec423a AnimEncoder API: In Assemble(), always set animation parameters.
-ae1c046 AnimEncoder lib cleanup: prev to prev canvas not needed.
-4b997ae WebPAnimEncoder API: Header and implementation
-72208be move argb_*.o build target to encoder list
-9592053 Merge "multi-thread fix: lock each entry points with a static var"
-4c1b300 Merge "SSE2 implementation of VP8PackARGB"
-fbcc200 Merge "add -Wformat-nonliteral and -Wformat-security"
-80d950d add -Wformat-nonliteral and -Wformat-security
-04c20e7 Merge "MIPS: dspr2: added optimization for function Intra4Preds"
-a437694 multi-thread fix: lock each entry points with a static var
-ca7f60d SSE2 implementation of VP8PackARGB
-72d573f simplify the PackARGB signature
-4e2589f demux: restore strict fragment flag check
-4ba8e07 Merge "webp-container-spec: remove references to fragments"
-e752f0a Merge "demux: remove experimental FRGM parsing"
-f8abb11 Merge changes I109ec4d9,I73fe7743
-ae2188a MIPS: dspr2: added optimization for function Intra4Preds
-1f4b864 move VP8EncDspARGBInit() call closer to where it's needed
-14108d7 dec_neon: add DC8uvNoTop / DC8uvNoLeft
-d8340da dec_neon: add DC8uv
-a66e66c webp-container-spec: remove references to fragments
-7ce8788 MIPS: dspr2: added optimization for function MakeARGB32
-012e623 demux: remove experimental FRGM parsing
-87c3d53 method=0: Don't evaluate any predictor
-6f4fcb9 Merge "MIPS: dspr2: added optimization for function ImportRow"
-2428445 replace unneeded calls to HistogramCopy() by swaps
-bdf7b40 MIPS: dspr2: added optimization for function ImportRow
-e66a922 Merge "MIPS: dspr2: added optimization for function ExportRowC"
-c279fec MIPS: dspr2: added optimization for function ExportRowC
-31a9cf6 Speedup WebP lossless compression for low effort (m=0) mode with following: - Disable Cross-Color transform. - Evaluate predictors #11 (paeth), #12 and #13 only.
-9275d91 MIPS: dspr2: added optimization for function TrueMotion
-26106d6 Merge "enc_neon: fix building with non-Xcode clang (iOS)"
-1c4e3ef unroll the kBands[] indirection to remove a dereference in GetCoeffs()
-a3946b8 enc_neon: fix building with non-Xcode clang (iOS)
-8ed9c00 Merge "simplify the Histogram struct, to only store max_value and last_nz"
-bad7757 simplify the Histogram struct, to only store max_value and last_nz
-3cca0dc MIPS: dspr2: Added optimization for DCMode function
-37e395f MIPS: fix functions to use generic BPS istead of hardcoded value
-9475bef PickBestUV: fix VP8Copy16x8 invocation
-441f273 Merge changes I55f8da52,Id73a1e96
-4a279a6 cosmetics: add some missing != NULL comparisons
-66ad372 factorize BPS definition in dsp.h and add VP8Copy16x8
-432e5b5 make ALIGN_xxx naming consistent
-5760604 encoder: switch BPS to 32 instead of 16
-1b66bbe MIPS: dspr2: added optimization for function TransformColor_C
-c6d0f9e histogram: cosmetics
-f399d30 Merge changes I6eac17e5,I32d2b514
-9de9074 dec_neon: add TM8uv
-8e517ec bit_reader/kVP8NewRange: range_t -> uint8_t
-e185713 dsp: initialize VP8PredChroma8 in VP8DspInit()
-e0c809a Move Entropy methods to lossless.c
-a96ccf8 iosbuild: add x64_64 simulator support
-a0df551 Remove handling for WEBP_HINT_GRAPH
-413dfc0 Move static method definition before its usage.
-0f23566 Update BackwardRefsWithLocalCache.
-d69e36e Remove TODOs from lossless encoder code.
-fdaac8e Optmize VP8LGetBackwardReferences LZ77 references.
-2f0e2ba MIPS: dspr2: added optimization for function Select
-a3e79a4 Merge "WebPEncode: Support encoding same pic twice (even if modified)"
-e4f4ddd WebPEncode: Support encoding same pic twice (even if modified)
-cbc3fbb Merge "Updated VP8LGetBackwardReferences and color cache."
-95a9bd8 Updated VP8LGetBackwardReferences and color cache.
-54f2c14 MIPS: dspr2: added optimization for function FTransform
-aa42f42 MIPS: dspr2: Added optimization for function VP8LSubtractGreenFromBlueAndRed
-11a25f7 Merge "FlattenSimilarBlocks should only be tried when blending is possible."
-5cccdad FlattenSimilarBlocks should only be tried when blending is possible.
-95ca44a MIPS: dspr2: added optimization for Disto4x4
-4171b67 backward_references.c: reindent after c8581b0
-c8581b0 Optimize BackwardReferences for RLE encoding.
-5798eee MIPS: dspr2: unfilters bugfix (Ie7b7387478a6b5c3f08691628ae00f059cf6d899)
-4167a3f Optimize backwardreferences
-d18554c Merge "webp/types.h: use inline for clang++/-std=c++11"
-7489b0e gif2webp: Add '-min-size' option to get best compression.
-77bdddf Speed up BackwardReferences
-6638710 webp/types.h: use inline for clang++/-std=c++11
-abf0420 Enable entropy based merge histo for (q<100)
-572022a filters_mips_dsp_r2.c: disable unfilters
-a28e21b MIPS: dspr2: Added optimization for function ClampedAddSubtractFull
-18d5a1e MIPS: dspr2: added optimization for function ClampedAddSubtractHalf
-829a8c1 MIPS: dspr2: added optimization for ITransform
-c94ed49 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
-653ace5 Increase the MAX_COLOR_CACHE_BITS from 9 to 10.
-919220c Change the logic adjusting the Histogram bits.
-53b096c Merge "Fix bug in VP8LCalculateEstimateForCacheSize."
-e912bd5 Fix bug in VP8LCalculateEstimateForCacheSize.
-541d783 Merge "dec_neon: add RD4 intra predictor"
-f8cd067 Merge "Makefile.vc: add a 'legacy' RTLIBCFG option"
-22881c9 dec_neon: add RD4 intra predictor
-613d281 update NEWS
-1304eb3 Merge "dec_neon: DC4: use pair-wise adds for top row"
-34c20c0 Makefile.vc: add a 'legacy' RTLIBCFG option
-7083006 Merge "dsp/dec_{neon,sse2}: VE4: normalize variable names"
-0db9031 dsp/dec_{neon,sse2}: VE4: normalize variable names
-b5bc153 dec_neon: DC4: use pair-wise adds for top row
-5b90d8f Unify the API between VP8BitWriter and VP8LBitWriter
-f7ada56 Merge changes I2e06907b,Ia9ed4ca6,I782282ff
-5beb6bf Merge "dec_neon: add VE4 intra predictor"
-eba6ce0 dec_neon: add DC4 intra predictor
-79abfbd dec_neon: add TM4 intra predictor
-fe395f0 dec_neon: add LD4 intra predictor
-32de385 dec_neon: add VE4 intra predictor
-72395ba Merge "Modify CostModel to allocate optimal memory."
-65e5eb8 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
-e4c829e gif2webp: Handle frames with odd offsets + disposal to background.
-c2b5a03 Modify CostModel to allocate optimal memory.
-b7a33d7 implement VE4/HE4/RD4/... in SSE2
-97c76f1 make VP8PredLuma4[] non-const and initialize array in VP8DspInit()
-0ea8c6c Merge "PrintReg: output to stderr"
-d7ff2f9 Merge "stopwatch.h: fix includes"
-f85ec71 PrintReg: output to stderr
-54edbf6 stopwatch.h: fix includes
-139142e Optimize BackwardReferenceHashChainFollowPath.
-5f36b68 enc/backward_references.c: fix indent
-e0e9960 Merge "sync version numbers to 0.4.2 release"
-64ac514 sync version numbers to 0.4.2 release
-c24f895 Simplify and speedup Backward refs computation.
-d1c359e fix shared object build with -fvisibility=hidden
-a4c3a31 WEBP_TSAN_IGNORE_FUNCTION: fix gcc compat warning
-f358eeb add code for testing random incremental decoding in dwebp
-8024729 mark some init function as being safe for thread_sanitizer.
-79b5bdb bit_reader.h: cosmetics: fix a typo
-6c67368 Improved near-lossless mode.
-0ce27e7 enc_mips32: workaround gcc-4.9 bug
-aca1b98 enc/vp8l.c: fix indent
-ca00502 Evaluate non-palette compression for palette image
-c8a87bb AssignSegments: quiet -Warray-bounds warning
-32f67e3 Merge "enc_neon: initialize vectors w/vdup_n_u32"
-fabc65d 1-3% faster encoding optimizing SSE_NxN functions
-7534d71 enc_neon: initialize vectors w/vdup_n_u32
-5f81391 Merge "Fix return code of EncodeImageInternal()"
-e321abe Fix return code of EncodeImageInternal()
-f82cb06 optimize palette ordering
-f545fee don't set the alpha value for histogram index image
-2d9b0a4 add WebPDispatchAlphaToGreen() to dsp
-1bd4c2a Merge "Change Entropy based Histogram Combine heuristic."
-e295b8f Merge "iosbuild: cleanup"
-1be4e76 Merge "iosbuild: output autoconf req. on failure"
-d5e498d Change Entropy based Histogram Combine heuristic.
-47a2d8e fix MSVC float->int conversion warning
-041956f iosbuild: cleanup
-767eb40 iosbuild: output autoconf req. on failure
-35ad48b HistoHeapInit: correct positions allocation size
-45d9635 lossless: entropy clustering for high qualities.
-dc37df8 fix type warning for VS9_x64
-9f7d9e6 iosbuild: make iOS 6 the minimum requirement
-fdd6528 Remove unused VP8LDecoder member variable
-ea3bba5 Merge "rewrite Disto4x4 in enc_neon.c with intrinsic"
-f060dfc add lossless incremental decoding support
-ab70794 rewrite Disto4x4 in enc_neon.c with intrinsic
-d447163 MIPS: dspr2: added optimization for function FilterLoop24
-2aef54d Merge "prepare VP8LDecodeImage for incremental decode"
-aed0f5a Merge "MIPS: dspr2: added optimization for function FilterLoop26"
-2863068 prepare VP8LDecodeImage for incremental decode
-248f3ae remove br->error_ field
-49e1504 MIPS: dspr2: added optimization for function FilterLoop26
-38128cb iobuild.sh: only install .h files in Headers
-c792d41 Premultiply with alpha during U/V downsampling
-0cc811d gif2webp: Background color correction
-d7167ff Amend the lossless spec according to issue #205, #206 and #224
-b901416 Record the lossless size stats.
-cddd334 Add a WebPExtractAlpha function to dsp
-0716a98 fix indent after I0204949917836f74c0eb4ba5a7f4052a4797833b
-f9ced95 Optimize lossless decoding for trivial(ARB) codes.
-924fcfd Merge "webpmux: simplify InitializeConfig()"
-c0a462c webpmux: simplify InitializeConfig()
-6986bb5 webpmux: fix indent
-f89e169 webpmux: fix exit status on numeric value parse error
-2172cb6 Merge "webpmux: fix loop_count range check"
-e3b343e Merge "examples: warn on invalid numeric parameters"
-0e23c48 webpmux: fix loop_count range check
-6208338 Merge "fix loop bug in DispatchAlpha()"
-d51f3e4 gif2webp: Handle frames with missing  graphic control extension
-690b491 fix loop bug in DispatchAlpha()
-96d43a8 examples: warn on invalid numeric parameters
-3101f53 MIPS: dspr2: added optimization for TransformOne
-a6bb9b1 SSE2 for inverse Mult(ARGB)Row and ApplyAlphaMultiply
-d84a8ff Remove default initialization of decoder status.
-be70b86 configure: simplify libpng-config invocation
-e0a9932 Rectify bug in lossless incremental decoding.
-e2502a9 MIPS: dspr2: added optimization for TransformAC3
-24e1072 MIPS: dspr2: added optimization for TransformDC
-c0e84df Merge "Slightly faster lossless decoding (1%)"
-8dd28bb Slightly faster lossless decoding (1%)
-f010359 MIPS: dspr2: added optimization for ColorIndexInverseTransforms
-d3242ae make VP8LSetBitPos() set br->eos_ flag
-a9decb5 Lossless decoding: fix eos_ flag condition
-3fea6a2 fix erroneous dec->status_ setting
-80b8099 MIPS: dspr2: add some specific mips code to commit I2c3f2b12f8df15b785fad5a9c56316e954ae0c53
-e564062 Merge "further refine the COPY_PATTERN optim for DecodeAlpha"
-854509f enc/histogram.c: reindent after f4059d0
-3442196 Merge "~3-5% faster encoding optimizing PickBestIntra*()"
-865069c further refine the COPY_PATTERN optim for DecodeAlpha
-a595622 added C-level optimization for DecodeAlphaData function
-187d379 add a fallback to ALPHA_NO_COMPRESSION
-a48a2d7 ~3-5% faster encoding optimizing PickBestIntra*()
-a614019 ExUtilReadFromStdin: (windows) open stdin in bin mode
-e80eab1 webpmux: (windows) open stdout in binary mode
-e9bfb11 cwebp: (windows) open stdout in binary mode
-5927e15 example_util: add ExUtilSetBinaryMode
-30f3b75 webpmux man page: Clarify some title, descriptions and examples
-77d4c7e address cosmetic comments from patch #71380
-f75dfbf Speed up Huffman decoding for lossless
-637b388 dsp/lossless: workaround gcc-4.9 bug on arm
-8323a90 dsp.h: collect gcc/clang version test macros
-e6c4b52 move static initialization of WebPYUV444Converters[] to the Init function.
-49911d4 Merge "fix indentation"
-f4059d0 Code cleanup for HistogramRemap.
-e632b09 fix indentation
-f5c04d6 Merge "add a DispatchAlpha() for SSE2 that handles 8 pixels at a time"
-fc98edd add a DispatchAlpha() for SSE2 that handles 8 pixels at a time
-73d361d introduce VP8EncQuantize2Blocks to quantize two blocks at a time
-0b21c30 MIPS: dspr2: added optimization for EmitAlphaRGB
-953acd5 enc_neon: enable QuantizeBlock for aarch64
-f4ae143 MIPS: mips32: code rebase
-5697715 MIPS: dspr2: added optimizations for VP8YuvTo*
-2523aa7 SmartRGBYUV: fix odd-width problem with pixel replication
-ee52dc4 fix some MSVC64 warning about float conversion
-3fca851 cpu: check for _MSC_VER before using msvc inline asm
-e2a83d7 faster RGB->YUV conversion function (~7% speedup)
-de2d03e Merge "Add smart RGB->YUV conversion option -pre 4"
-3fc4c53 Add smart RGB->YUV conversion option -pre 4
-b4dc406 MIPS: dspr2: added optimization for (un)filters
-137e609 Merge "configure: add work around for gcc-4.9 aarch64 bug"
-b61c9ce MIPS: dspr2: Optimization of some simple point-sampling functions
-e2b8cec configure: add work around for gcc-4.9 aarch64 bug
-98c5410 MIPS: mips32r2: added optimization for BSwap32
-dab702b Update PATENTS to reflect s/VP8/WebM/g
-b564f7c Merge "MIPS: detect mips32r6 and disable mips32r1 code"
-b7e5a5c MIPS: detect mips32r6 and disable mips32r1 code
-63c2fc0 Correctly use the AC_CANONICAL_* macros
-bb07022 Merge "cosmetics"
-e300c9d cosmetics
-0e519ee Merge "cosmetics: remove some extraneous 'extern's"
-3ef0f08 Merge "vp8enci.h: cosmetics: fix '*' placement"
-4c6dde3 bit_writer: cosmetics: rename kFlush() -> Flush()
-f7b4c48 cosmetics: remove some extraneous 'extern's
-b47fb00 vp8enci.h: cosmetics: fix '*' placement
-b5a36cc add -near_lossless [0..100] experimental option
-0524d9e dsp: detect mips64 & disable mips32 code
-d3485d9 cwebp.1: fix quality description placement
-29a9fe2 Merge tag 'v0.4.1'
-8af2771 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1)
-e09e9ff Record & log the image pre-processing time.
-f59c0b4 iosbuild.sh: specify optimization flags
-8d34ea3 update ChangeLog (tag: v0.4.1-rc1)
-dbc3da6 makefile.unix: add vwebp.1 to the dist target
-89a7c83 update ChangeLog
-ffe67ee Merge "update NEWS for the next release" into 0.4.1
-2def1fe gif2webp: dust up the help message
-fb668d7 remove -noalphadither option from README/vwebp.1
-e49f693 update NEWS for the next release
-cd01358 Merge "update AUTHORS" into 0.4.1
-268d01e update AUTHORS
-85213b9 bump version to 0.4.1
-695f80a Merge "restore mux API compatibility" into 0.4.1
-862d296 restore mux API compatibility
-8f6f8c5 remove the !WEBP_REFERENCE_IMPLEMENTATION tweak in Put8x8uv
-d713a69 Merge changes If4debc15,I437a5d5f into 0.4.1
-c2fc52e restore encode API compatibility
-793368e restore decode API compatibility
-b8984f3 gif2webp: fix compile with giflib 5.1.0
-222f9b1 gif2webp: simplify giflib version checking
-d2cc61b Extend MakeARGB32() to accept Alpha channel.
-4595b62 Merge "use explicit size of kErrorMessages[] arrays"
-157de01 Merge "Actuate memory stats for PRINT_MEMORY_INFO"
-fbda2f4 JPEG decoder: delay conversion to YUV to WebPEncode() call
-0b747b1 use explicit size of kErrorMessages[] arrays
-3398d81 Actuate memory stats for PRINT_MEMORY_INFO
-6f3202b Merge "move WebPPictureInit to picture.c"
-6c347bb move WebPPictureInit to picture.c
-fb3acf1 fix configure message for multi-thread
-40b086f configure: check for _beginthreadex
-1549d62 reorder the YUVA->ARGB and ARGB->YUVA functions correctly
-c6461bf Merge "extract colorspace code from picture.c into picture_csp.c"
-736f2a1 extract colorspace code from picture.c into picture_csp.c
-645daa0 Merge "configure: check for -Wformat-security"
-abafed8 configure: check for -Wformat-security
-fbadb48 split monolithic picture.c into picture_{tools,psnr,rescale}.c
-c76f07e dec_neon/TransformAC3: initialize vector w/vcreate
-bb4fc05 gif2webp: Allow single-frame animations
-46fd44c thread: remove harmless race on status_ in End()
-5a1a726 Merge "configure: check for __builtin_bswapXX()"
-6781423 configure: check for __builtin_bswapXX()
-6450c48 configure: fix iOS builds
-6422e68 VP8LFillBitWindow: enable fast path for 32-bit builds
-4f7f52b VP8LFillBitWindow: respect WEBP_FORCE_ALIGNED
-e458bad endian_inl.h: implement htoleXX with BSwapXX
-f2664d1 endian_inl.h: add BSwap16
-6fbf534 Merge "configure: add --enable-aligned"
-dc0f479 configure: add --enable-aligned
-9cc69e2 Merge "configure: support WIC + OpenGL under mingw64"
-257adfb remove experimental YUV444 YUV422 and YUV400 code
-10f4257 configure: support WIC + OpenGL under mingw64
-380cca4 configure.ac: add AC_C_BIGENDIAN
-ee70a90 endian_inl.h: add BSwap64
-47779d4 endian_inl.h: add BSwap32
-d5104b1 utils: add endian_inl.h
-58ab622 Merge "make alpha-detection loop in IsKeyFrame() in good x/y order"
-9d56290 make alpha-detection loop in IsKeyFrame() in good x/y order
-516971b lossless: Remove unaligned read warning
-b8b596f Merge "configure.ac: add an autoconf version prerequisite"
-34b02f8 configure.ac: add an autoconf version prerequisite
-e59f536 neon: normalize vdup_n_* usage
-6ee7160 Merge changes I0da7b3d3,Idad2f278,I4accc305
-abc02f2 Merge "fix (uncompiled) typo"
-bc03670 neon: add INIT_VECTOR4
-6c1c632 neon: add INIT_VECTOR3
-dc7687e neon: add INIT_VECTOR2
-4536e7c add WebPMuxSetCanvasSize() to the mux API
-824eab1 fix (uncompiled) typo
-1f3e5f1 remove unused 'shift' argument and QFIX2 define
-8e86705 Merge "VP8LoadNewBytes: use __builtin_bswap32 if available"
-1b6a263 Merge "Fix handling of weird GIF with canvas dimension 0x0"
-1da3d46 VP8LoadNewBytes: use __builtin_bswap32 if available
-1582e40 Fix handling of weird GIF with canvas dimension 0x0
-b8811da Merge "rename interface -> winterface"
-db8b8b5 Fix logic in the GIF LOOP-detection parsing
-25aaddc rename interface -> winterface
-5584d9d make WebPSetWorkerInterface() check its arguments
-a9ef7ef Merge "cosmetics: update thread.h comments"
-c6af999 Merge "dust up the help message"
-0a8b886 dust up the help message
-a9cf319 cosmetics: update thread.h comments
-27bfeee QuantizeBlock SSE2 Optimization:
-2bc0dc3 Merge "webpmux: warn when odd frame offsets are used"
-3114ebe Merge changes Id8edd3c1,Id418eb96,Ide05e3be
-c072663 webpmux: warn when odd frame offsets are used
-c5c6b40 Merge "add alpha dithering for lossy"
-d514678 examples/Android.mk: add cwebp
-ca0fa7c Android.mk: move dwebp to examples/Android.mk
-73d8fca Android.mk: add ENABLE_SHARED flag
-6e93317 muxread: fix out of bounds read
-8b0f6a4 Makefile.vc: fix CFLAGS assignment w/HAVE_AVX2=1
-bbe32df add alpha dithering for lossy
-7902076 Merge "make error-code reporting consistent upon malloc failure"
-77bf441 make error-code reporting consistent upon malloc failure
-7a93c00 **/Makefile.am: remove unused AM_CPPFLAGS
-24e3080 Add an interface abstraction to the WebP worker thread implementation
-d6cd635 Merge "fix orig_rect==NULL case"
-2bfd1ff fix orig_rect==NULL case
-059e21c Merge "configure: move config.h to src/webp/config.h"
-f05fe00 properly report back encoding error code in WebPFrameCacheAddFrame()
-32b3137 configure: move config.h to src/webp/config.h
-90090d9 Merge changes I7c675e51,I84f7d785
-ae7661b makefiles: define WEBP_HAVE_AVX2 when appropriate
-69fce2e remove the special casing for res->first in VP8SetResidualCoeffs
-6e61a3a configure: test for -msse2
-b9d2efc rename upsampling_mips32.c to yuv_mips32.c
-bdfeeba dsp/yuv: move sse2 functions to yuv_sse2.c
-46b32e8 Merge "configure: set WEBP_HAVE_AVX2 when available"
-88305db Merge "VP8RandomBits2: prevent signed int overflow"
-73fee88 VP8RandomBits2: prevent signed int overflow
-db4860b enc_sse2: prevent signed int overflow
-3fdaf4d Merge "real fix for longjmp warning"
-385e334 real fix for longjmp warning
-230a055 configure: set WEBP_HAVE_AVX2 when available
-a2ac8a4 restore original value_/range_ field order
-5e2ee56 Merge "remove libwebpdspdecode dep on libwebpdsp_avx2"
-61362db remove libwebpdspdecode dep on libwebpdsp_avx2
-42c447a Merge "lossy bit-reader clean-up:"
-479ffd8 Merge "remove unused #include's"
-9754d39 Merge "strong filtering speed-up (~2-3% x86, ~1-2% for NEON)"
-158aff9 remove unused #include's
-09545ee lossy bit-reader clean-up:
-ea8b0a1 strong filtering speed-up (~2-3% x86, ~1-2% for NEON)
-6679f89 Optimize VP8SetResidualCoeffs.
-ac591cf fix for gcc-4.9 warnings about longjmp + local variables
-4dfa86b dsp/cpu: NaCl has no support for xgetbv
-4c39869 Merge "cwebp: fallback to native webp decode in WIC builds"
-33aa497 Merge "cwebp: add some missing newlines in longhelp output"
-c9b340a fix missing WebPInitAlphaProcessing call for premultiplied colorspace output
-57897ba Merge "lossless_neon: use vcreate_*() where appropriate"
-6aa4777 Merge "(enc|dec)_neon: use vcreate_*() where appropriate"
-0d346e4 Always reinit VP8TransformWHT instead of hard-coding
-7d039fc cwebp: fallback to native webp decode in WIC builds
-d471f42 cwebp: add some missing newlines in longhelp output
-bf0e003 lossless_neon: use vcreate_*() where appropriate
-9251c2f (enc|dec)_neon: use vcreate_*() where appropriate
-399b916 lossy decoding: correct alpha-rescaling for YUVA format
-78c12ed Merge "Makefile.vc: add rudimentary avx2 support"
-dc5b122 try to remove the spurious warning for static analysis
-ddfefd6 Makefile.vc: add rudimentary avx2 support
-a891164 Merge "simplify VP8LInitBitReader()"
-fdbcd44 simplify VP8LInitBitReader()
-7c00428 makefile.unix: add rudimentary avx2 support
-515e35c Merge "add stub dsp/enc_avx2.c"
-a05dc14 SSE2: yuv->rgb speed-up for point-sampling
-178e9a6 add stub dsp/enc_avx2.c
-1b99c09 Merge "configure: add a test for -mavx2"
-fe72807 configure: add a test for -mavx2
-e46a247 cpu: fix check for __cpuidex availability
-176fda2 fix the bit-writer for lossless in 32bit mode
-541784c dsp.h: add a check for AVX2 / define WEBP_USE_AVX2
-bdb151e dsp/cpu: add AVX2 detection
-ab9f2f8 Merge "revamp the point-sampling functions by processing a full plane"
-a2f8b28 revamp the point-sampling functions by processing a full plane
-ef07602 use decoder's DSP functions for autofilter
-2b5cb32 Merge "dsp/cpu: add AVX detection"
-df08e67 dsp/cpu: add AVX detection
-e2f405c Merge "clean-up and slight speed-up in-loop filtering SSE2"
-f60957b clean-up and slight speed-up in-loop filtering SSE2
-9fc3ae4 .gitattributes: treat .ppm as binary
-3da924b Merge "dsp/WEBP_USE_NEON: test for __aarch64__"
-c716449 Android.mk: always include *_neon.c in the build
-a577b23 dsp/WEBP_USE_NEON: test for __aarch64__
-54bfffc move RemapBitReader() from idec.c to bit_reader code
-34168ec Merge "remove all unused layer code"
-f1e7717 remove all unused layer code
-b0757db Code cleanup for VP8LGetHistoImageSymbols.
-5fe628d make the token page size be variable instead of fixed 8192
-f948d08 memory debug: allow setting pre-defined malloc failure points
-ca3d746 use block-based allocation for backward refs storage, and free-lists
-1ba61b0 enable NEON intrinsics in aarch64 builds
-b9d2bb6 dsp/neon.h: coalesce intrinsics-related defines
-b5c7525 iosbuild: add support for iOSv7/aarch64
-9383afd Reduce number of memory allocations while decoding lossless.
-888e63e Merge "dsp/lossless: prevent signed int overflow in left shift ops"
-8137f3e Merge "instrument memory allocation routines for debugging"
-2aa1873 instrument memory allocation routines for debugging
-d3bcf72 Don't allocate VP8LHashChain, but treat like automatic object
-bd6b861 dsp/lossless: prevent signed int overflow in left shift ops
-b7f19b8 Merge "dec/vp8l: prevent signed int overflow in left shift ops"
-29059d5 Merge "remove some uint64_t casts and use."
-e69a1df dec/vp8l: prevent signed int overflow in left shift ops
-cf5eb8a remove some uint64_t casts and use.
-38e2db3 MIPS: MIPS32r1: Added optimization for HistogramAdd.
-e0609ad dwebp: fix exit code on webp load failure
-bbd358a Merge "example_util.h: avoid forward declaring enums"
-8955da2 example_util.h: avoid forward declaring enums
-6d6865f Added SSE2 variants for Average2/3/4
-b3a616b make HistogramAdd() a pointer in dsp
-c8bbb63 dec_neon: relocate some inline-asm defines
-4e393bb dec_neon: enable intrinsics-only functions
-ba99a92 dec_neon: use positive tests for USE_INTRINSICS
-69058ff Merge "example_util: add ExUtilDecodeWebPIncremental"
-a7828e8 dec_neon: make WORK_AROUND_GCC conditional on version
-3f3d717 Merge "enc_neon: enable intrinsics-only functions"
-de3cb6c Merge "move LOCAL_GCC_VERSION def to dsp.h"
-1b2fe14 example_util: add ExUtilDecodeWebPIncremental
-ca49e7a Merge "enc_neon: move Transpose4x4 to dsp/neon.h"
-ad900ab Merge "fix warning about size_t -> int conversion"
-4825b43 fix warning about size_t -> int conversion
-42b35e0 enc_neon: enable intrinsics-only functions
-f937e01 move LOCAL_GCC_VERSION def to dsp.h
-5e1a17e enc_neon: move Transpose4x4 to dsp/neon.h
-c7b92a5 dec_neon: (WORK_AROUND_GCC) delete unused Load4x8
-8e5f90b Merge "make ExUtilLoadWebP() accept NULL bitstream param."
-05d4c1b Merge "cwebp: add webpdec"
-ddeb6ac cwebp: add webpdec
-35d7d09 Merge "Reduce memory footprint for encoding WebP lossless."
-0b89610 Reduce memory footprint for encoding WebP lossless.
-f0b65c9 make ExUtilLoadWebP() accept NULL bitstream param.
-9c0a60c Merge "dwebp: move webp decoding to example_util"
-1d62acf MIPS: MIPS32r1: Added optimization for HuffmanCost functions.
-4a0e739 dwebp: move webp decoding to example_util
-c022046 Merge "Bugfix: Incremental decode of lossy-alpha"
-8c7cd72 Bugfix: Incremental decode of lossy-alpha
-7955152 MIPS: fix error with number of registers.
-b1dabe3 Merge "Move the HuffmanCost() function to dsp lib"
-75b1200 Move the HuffmanCost() function to dsp lib
-2772b8b MIPS: fix assembler error revealed by clang's debug build
-6653b60 enc_mips32: fix unused symbol warning in debug
-8dec120 enc_mips32: disable ITransform(One) in debug builds
-98519dd enc_neon: convert Disto4x4 to intrinsics
-fe9317c cosmetics:
-953b074 enc_neon: cosmetics
-a9fc697 Merge "WIP: extract the float-calculation of HuffmanCost from loop"
-3f84b52 Merge "replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)"
-4ae0533 MIPS: MIPS32r1: Added optimizations for ExtraCost functions.
-b30a04c WIP: extract the float-calculation of HuffmanCost from loop
-a8fe8ce Merge "NEON intrinsics version of CollectHistogram"
-95203d2 NEON intrinsics version of CollectHistogram
-7ca2e74 replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)
-41c6efb fix lossless_neon.c
-8ff96a0 NEON intrinsics version of FTransform
-0214f4a Merge "MIPS: MIPS32r1: Added optimizations for FastLog2"
-baabf1e MIPS: MIPS32r1: Added optimizations for FastLog2
-3d49871 NEON functions for lossless coding
-3fe0291 MIPS: MIPS32r1: Added optimizations for SSE functions.
-c503b48 Merge "fix the gcc-4.6.0 bug by implementing alternative method"
-abe6f48 fix the gcc-4.6.0 bug by implementing alternative method
-5598bde enc_mips32.c: fix file mode
-2b1b4d5 MIPS: MIPS32r1: Add optimization for GetResidualCost
-f0a1f3c Merge "MIPS: MIPS32r1: Added optimization for FTransform"
-7231f61 MIPS: MIPS32r1: Added optimization for FTransform
-869eaf6  ~30% encoding speedup: use NEON for QuantizeBlock()
-f758af6 enc_neon: convert FTransformWHT to intrinsics
-7dad095 MIPS: MIPS32r1: Added optimization for Disto4x4 (TTransform)
-2298d5f MIPS: MIPS32r1: Added optimization for QuantizeBlock
-e88150c Merge "MIPS: MIPS32r1: Add optimization for ITransform"
-de693f2 lossless_neon: disable VP8LConvert* functions
-4143332 NEON intrinsics for encoding
-0ca2914 MIPS: MIPS32r1: Add optimization for ITransform
-71bca5e dec_neon: use vst_lane instead of vget_lane
-bf06105 Intrinsics NEON version of TransformOne
-19c6f1b Merge "dec_neon: use vld?_lane instead of vset?_lane"
-7a94c0c upsampling_neon: drop NEON suffix from local functions
-d14669c upsampling_sse2: drop SSE2 suffix from local functions
-2ca42a4 enc_sse2: drop SSE2 suffix from local functions
-d038e61 dec_sse2: drop SSE2 suffix from local functions
-fa52d75 dec_neon: use vld?_lane instead of vset?_lane
-c520e77 cosmetic: fix long line
-4b0f2da Merge "add intrinsics NEON code for chroma strong-filtering"
-e351ec0 add intrinsics NEON code for chroma strong-filtering
-aaf734b Merge "Add SSE2 version of forward cross-color transform"
-c90a902 Add SSE2 version of forward cross-color transform
-bc374ff Use histogram_bits to initalize transform_bits.
-2132992 Merge "Add strong filtering intrinsics (inner and outer edges)"
-5fbff3a Add strong filtering intrinsics (inner and outer edges)
-d4813f0 Add SSE2 function for Inverse Cross-color Transform
-2602956 dec_neon: add strong loopfilter intrinsics
-cca7d7e Merge "add intrinsics version of SimpleHFilter16NEON()"
-1a05dfa windows: fix dll builds
-d6c50d8 Merge "add some colorspace conversion functions in NEON"
-4fd7c82 SSE2 variants of Subtract-Green: Rectify loop condition
-97e5fac add some colorspace conversion functions in NEON
-b9a7a45 add intrinsics version of SimpleHFilter16NEON()
-daccbf4 add light filtering NEON intrinsics
-af44460 fix typo in STORE_WHT
-6af6b8e Tune HistogramCombineBin for large images.
-af93bdd use WebPSafe[CM]alloc/WebPSafeFree instead of [cm]alloc/free
-51f406a lossless_sse2: relocate VP8LDspInitSSE2 proto
-0f4f721 separate SSE2 lossless functions into its own file
-514fc25 VP8LConvertFromBGRA: use conversion function pointers
-6d2f352 dsp/dec: TransformDCUV: use VP8TransformDC
-defc8e1 Merge "fix out-of-bound read during alpha-plane decoding"
-fbed364 Merge "dsp: reuse wht transform from dec in encoder"
-d846708 Merge "Add SSE2 version of ARGB -> BGR/RGB/... conversion functions"
-207d03b fix out-of-bound read during alpha-plane decoding
-d1b33ad 2-5% faster trellis with clang/MacOS (and ~2-3% on ARM)
-369c26d Add SSE2 version of ARGB -> BGR/RGB/... conversion functions
-df230f2 dsp: reuse wht transform from dec in encoder
-80e218d Android.mk: fix build with APP_ABI=armeabi-v7a-hard
-59daf08 Merge "cosmetics:"
-5362200 cosmetics:
-3e7f34a AssignSegments: quiet array-bounds warning
-3c2ebf5 Merge "UpdateHistogramCost: avoid implicit double->float"
-cf821c8 UpdateHistogramCost: avoid implicit double->float
-312e638 Extend the search space for GetBestGreenRedToBlue
-1c58526 Fix few nits
-fef2270 Optimize and re-structure VP8LGetHistoImageSymbols
-068b14a Optimize lossless decoding.
-5f0cfa8 Do a binary search to get the optimum cache bits.
-24ca367 Merge "allow 'cwebp -o -' to emit output to stdout"
-e12f874 allow 'cwebp -o -' to emit output to stdout
-2bcad89 allow some more stdin/stout I/O
-84ed4b3 fix cwebp.1 typos after patch #69199
-65b99f1 add a -z option to cwebp, and WebPConfigLosslessPreset() function
-3017661 4-5% faster trellis by removing some unneeded calculations.
-687a58e histogram.c: reindent after b33e8a0
-06d456f Merge "~3-4% faster lossless encoding"
-c60de26 ~3-4% faster lossless encoding
-42eb06f Merge "few cosmetics after patch #69079"
-82af826 few cosmetics after patch #69079
-b33e8a0 Refactor code for HistogramCombine.
-ca1bfff Merge "5-10% encoding speedup with faster trellis (-m 6)"
-5aeeb08 5-10% encoding speedup with faster trellis (-m 6)
-82ae1bf cosmetics: normalize VP8GetCPUInfo checks
-e3dd924 Merge "Refactor GetBestPredictorForTile for future tuning."
-206cc1b Refactor GetBestPredictorForTile for future tuning.
-3cb8406 Merge "speed-up trellis quant (~5-10% overall speed-up)"
-b66f222 Merge "lossy encoding: ~3% speed-up"
-4287d0d speed-up trellis quant (~5-10% overall speed-up)
-390c8b3 lossy encoding: ~3% speed-up
-9a463c4 Merge "dec_neon: convert TransformWHT to intrinsics"
-e8605e9 Merge "dec_neon: add ConvertU8ToS16"
-4aa3e41 MIPS: MIPS32r1: rescaler bugfix
-c16cd99 Speed up lossless encoder.
-9d6b5ff dec_neon: convert TransformWHT to intrinsics
-2ff0aae dec_neon: add ConvertU8ToS16
-77a8f91 fix compilation with USE_YUVj flag
-4acbec1 Merge changes I3b240ffb,Ia9370283,Ia2d28728
-2719bb7 dec_neon: TransformAC3: work on packed vectors
-b7b60ca dec_neon: add SaturateAndStore4x4
-b7685d7 Rescale: let ImportRow / ExportRow be pointer-to-function
-e02f16e dec_neon.c: convert TransformDC to intrinsics
-9cba963 add missing file
-8992ddb use static clipping tables
-0235d5e 1-2% faster quantization in SSE2
-b2fbc36 fix VC12-x64 warning
-6e37cb9 Merge "cosmetics: backward_references.c: reindent after a7d2ee3"
-a42ea97 cosmetics: backward_references.c: reindent after a7d2ee3
-6c32744 Merge "fix missing __BIG_ENDIAN__ definition on some platform"
-a8b6aad fix missing __BIG_ENDIAN__ definition on some platform
-fde2904 Increase initial buffer size for VP8L Bit Writer.
-a7d2ee3 Optimize cache estimate logic.
-7fb6095 Merge "dec_neon.c: add TransformAC3"
-bf182e8 VP8LBitWriter: use a bit-accumulator
-3f40b4a Merge "MIPS: MIPS32r1: clang macro warning resolved"
-1684f4e WebP Decoder: Mark some truncated bitstreams as invalid
-acbedac MIPS: MIPS32r1: clang macro warning resolved
-228e487 dec_neon.c: add TransformAC3
-393f89b Android.mk: avoid gcc-specific flags with clang
-32aeaf1 revamp VP8LColorSpaceTransform() a bit
-0c7cc4c Merge "Don't dereference NULL, ensure HashChain fully initialized"
-391316f Don't dereference NULL, ensure HashChain fully initialized
-926ff40 WEBP_SWAP_16BIT_CSP: remove code dup
-1d1cd3b Fix decode bug for rgbA_4444/RGBA_4444 color-modes.
-939e70e update AUTHORS file
-8934a62 cosmetics: *_mips32.c
-dd438c9 MIPS: MIPS32r1: Optimization of some simple point-sampling functions. PATCH [6/6]
-5352091 Added support for calling sampling functions via pointers.
-d16c697 MIPS: MIPS32r1: Optimization of filter functions. PATCH [5/6]
-04336fc MIPS: MIPS32r1: Optimization of function TransformOne. PATCH [4/6]
-92d8fc7 MIPS: MIPS32r1: Optimization of function WebPRescalerImportRow. PATCH [3/6]
-bbc23ff parse one row of intra modes altogether
-a2f608f Merge "MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]"
-8823085 MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]
-c5a5b02 decode mt+incremental: fix segfault in debug builds
-9882b2f always use fast-analysis for all methods.
-000adac Merge "autoconf: update ax_pthread.m4"
-2d2fc37 update .gitignore
-5bf4255 Merge "Make it possible to avoid automagic dependencies"
-c1cb193 disable NEON for arm64 platform
-73a304e Make it possible to avoid automagic dependencies
-4d493f8 MIPS: MIPS32r1: Decoder bit reader function optimized. PATCH [1/6]
-c741183 make WebPCleanupTransparentArea work with argb picture
-5da1855 add a decoding option to flip image vertically
-00c3c4e Merge "add man/vwebp.1"
-2c6bb42 add man/vwebp.1
-ea59a8e Merge "Merge tag 'v0.4.0'"
-7574bed fix comments related to array sizes
-0b5a90f dwebp.1: fix option formatting
-effcb0f Merge tag 'v0.4.0'
-7c76255 autoconf: update ax_pthread.m4
-fff2a11 make -short work with -print_ssim, -print_psnr, etc.
-68e7901 update ChangeLog (tag: v0.4.0-rc1, tag: v0.4.0, origin/0.4.0, 0.4.0)
-256e433 update NEWS description with new general features
-2962534 Merge "gif2webp: don't use C99 %zu" into 0.4.0
-3b9f9dd gif2webp: don't use C99 %zu
-b5b2e3c cwebp: fix metadata output w/lossy+alpha
-ad26df1 makefile.unix: clean up libgif2webp_util.a
-c3b4557 update Changelog
-ca84112 Merge "bump version to 0.4.0" into 0.4.0
-8c524db bump version to 0.4.0
-eec2398 update AUTHORS & .mailmap
-b9bbf6a update NEWS for 0.4.0
-c72e081 Merge "dec/webp.c: don't wait for data before reporting w/h"
-5ad6531 dec/frame.c: fix formatting
-f7fc4bc dec/webp.c: don't wait for data before reporting w/h
-66a32af Merge "NEON speed up"
-26d842e NEON speed up
-f307f98 Merge "webpmux: let -- stop parameter parsing"
-fe051da Merge "README: add a section on gif2webp"
-6fd2bd6 Merge "manpage pedantry"
-4af1900 README: add a section on gif2webp
-6f36ade manpage pedantry
-f9016cb README: update dwebp options
-b4fa0a4 webpmux: let -- stop parameter parsing
-a9a20ac gif2webp: Add a multi-threaded encode option
-495bef4 fix bug in TrellisQuantize
-605a712 simplify __cplusplus ifdef
-33109f9 Merge "drop: ifdef __cplusplus checks from C files"
-7f9de0b Merge changes I994a5587,I8467bb71,I13b50688,I1e2c9c7b
-5459030 gif2webp: let -- stop parameter parsing
-a4b0aa0 vwebp: let -- stop parameter parsing
-98af68f cwebp: let -- stop parameter parsing
-a33831e dwebp: let -- stop parameter parsing
-3630124 add some checks on error paths
-ce4c713 Merge "autoconf: add --disable-wic"
-5227d99 drop: ifdef __cplusplus checks from C files
-f645355 dwebp.1: fix typo
-f91034f Merge "cwebp: print metadata stats when no output file is given"
-d493455 gif2webp: Backward compatibility for giflib version <= 4.1.3
-4c617d3 gif2webp: Disable output of ICC profile by default
-73b731f introduce a special quantization function for WHT
-41c0cc4 Make Forward WHT transform use 32bit fixed-point calculation
-a3359f5 Only compute quantization params once
-7049043 cwebp: print metadata stats when no output file is given
-d513bb6 * fix off-by-one zthresh calculation * remove the sharpening for non luma-AC coeffs * adjust the bias a little bit to compensate for this
-ad9dec0 Merge "cosmetics: dwebp: fix local function name format"
-f737f03 Merge "dwebp: remove a dead store"
-3c3a70d Merge "makefile.unix: install binaries in $(DESTDIR)/bin/"
-150b655 Merge "Android.mk: add some release compile flags"
-dbebd33 cosmetics: dwebp: fix local function name format
-2774995 dwebp: remove a dead store
-a01e04f autoconf: add --disable-wic
-5009b22 makefile.unix: install binaries in $(DESTDIR)/bin/
-bab30fc Merge "fix -print_psnr / ssim options"
-ebef7fb fix -print_psnr / ssim options
-cb63785 Merge "fix bug due to overzealous check in WebPPictureYUVAToARGB()"
-8189885 Merge "EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE"
-4ad7d33 Android.mk: add some release compile flags
-c12e236 cosmetics: fix a few typos
-6f10403 fix bug due to overzealous check in WebPPictureYUVAToARGB()
-3f6c35c EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE
-cc55790 Merge changes I8bb7a4dc,I2c180051,I021a014f,I8a224a62
-c536afb Merge "cosmetics: fix some typos"
-cbdd3e6 add a -dither dithering option to the decoder
-e812401 Updated iosbuild.sh for XCode 5.x
-4931c32 cosmetics: fix some typos
-05aacf7 mux: add some missing casts
-617d934 enc/vp8l: add a missing cast
-46db286 idec: add some missing casts
-b524e33 ErrorStatusLossless: correct return type
-cb261f7 fix a descaling bug for vertical/horizontal U/V interpolation
-bcb3955 Merge changes I48968468,I181bc736
-73f5213 gif2webp: Add a mixed compression mode
-6198715 demux: split chunk parsing from ParseVP8X
-d2e3f4e demux: add a tail pointer for chunks
-87cffcc demux: cosmetics: s/has_frames/is_animation/
-e18e667 demux: strictly enforce the animation flag
-c4f39f4 demux: cosmetics: remove a useless break
-61cb884 demux: (non-exp) fail if the fragmented flag is set
-ff379db few % speedup of lossless encoding
-df3649a remove all disabled code related to P-frames
-6d0cb3d Merge "gif2webp: kmin = 0 should suppress key-frame addition."
-3655598 gif2webp: kmin = 0 should suppress key-frame addition.
-7708e60 Merge "detect flatness in blocks and favor DC prediction"
-06b1503 Merge "add comment about the kLevelsFromDelta[][] LUT generation"
-5935259 add comment about the kLevelsFromDelta[][] LUT generation
-e3312ea detect flatness in blocks and favor DC prediction
-ebc9b1e Merge "VPLBitReader bugfix: Catch error if bit_pos > LBITS too."
-96ad0e0 VPLBitReader bugfix: Catch error if bit_pos > LBITS too.
-a014e9c tune quantization biases toward higher precision
-1e89861 add helpful PrintBlockInfo() function
-596a6d7 make use of 'extern' consistent in function declarations
-c8d48c6 Merge "extract random utils to their own file util/random.[ch]"
-98aa33c extract random utils to their own file util/random.[ch]
-432a723 Merge "swig: add basic go bindings"
-fab618b Merge "rename libwebp.i -> libwebp.swig"
-e4e7fcd swig: add basic go bindings
-d340872 Merge "fast auto-determined filtering strength"
-f8bfd5c fast auto-determined filtering strength
-ac0bf95 small clean-up in ExpandMatrix()
-1939607 rename libwebp.i -> libwebp.swig
-43148b6 filtering: precompute ilimit and hev_threshold
-18f992e simplify f_inner calculation a little
-241d11f add missing const
-86c0031 add a 'format' field to WebPBitstreamFeatures
-dde91fd Demux: Correct the extended format validation
-5d6c5bd add entry for '-resize' option in cwebp's man
-7c098d1 Use some gamma-curve range compression when computing U/V average
-0b2b050 Use deterministic random-dithering during RGB->YUV conversion
-8a2fa09 Add a second multi-thread method
-7d6f2da Merge "up to 20% faster multi-threaded decoding"
-266f63e Merge "libwebp.jar: build w/Java 1.6 for Android compat"
-0532149 up to 20% faster multi-threaded decoding
-38efdc2 Simplify the gif2webp tool: move the optimization details to util
-de89951 libwebp.jar: build w/Java 1.6 for Android compat
-cb22155 Decode a full row of bitstream before reconstructing
-dca8a4d Merge "NEON/simple loopfilter: avoid q4-q7 registers"
-9e84d90 Merge "NEON/TransformWHT: avoid q4-q7 registers"
-fc10249 NEON/simple loopfilter: avoid q4-q7 registers
-2f09d63 NEON/TransformWHT: avoid q4-q7 registers
-77585a2 Merge "use a macrofunc for setting NzCoeffs bits"
-d155507 Merge "use HINT_GRAPH as image_hint for gif source"
-9c56164 Merge "only print GIF_DISPOSE_WARNING once"
-0587986 use HINT_GRAPH as image_hint for gif source
-0b28d7a use a macrofunc for setting NzCoeffs bits
-f9bbc2a Special-case sparse transform
-0012519 gif2webp: detect and flatten uniformly similar blocks
-0deaf0f only print GIF_DISPOSE_WARNING once
-6a8c0eb Merge "small optimization in segment-smoothing loop"
-f7146bc small optimization in segment-smoothing loop
-5a7533c small gif2webp fix
-4df0c89 Merge changes Ic697660c,I27285521
-5b2e6bd Android.mk: add a dwebp target
-f910a84 Android.mk: update build flags
-63f9aba special-case WHT transform when there's only DC
-80911ae Merge "7-8% faster decoding by rewriting GetCoeffs()"
-606c430 gif2webp: Improved compression for lossy animated WebP
-fb887f7 gif2webp: Different kmin/kmax defaults for lossy and lossless
-2a98136 7-8% faster decoding by rewriting GetCoeffs()
-92d47e4 improve VP8L signature detection by checking the version bits too
-5cd43e4 Add -incremental option to dwebp
-54b8e3f webpmux: DisplayInfo(): remove unnecessary error checks.
-40ae352 fix memleak in WebPIDelete()
-d966265 mux.h doc: WebPMuxGetFrame() can return WEBP_MUX_MEMORY_ERROR too.
-0e6747f webpmux -info: display dimensions and has_alpha per frame
-d78a82c Sanity check for underflow
-8498f4b Merge "remove -Wshadow warnings"
-e89c6fc Avoid a potential memleak
-3ebe175 Merge "break down the proba 4D-array into some handy structs"
-6a44550 break down the proba 4D-array into some handy structs
-2f5e893 remove -Wshadow warnings
-bf3a29b Merge "add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags"
-2b0a759 Merge "fix some warnings from static analysis"
-22dd07c mux.h: Some doc corrections
-79ff034 add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags
-d51f45f fix some warnings from static analysis
-d134307 fix conversion warning on MSVC
-d538cea gif2webp: Support a 'min' and 'max'  key frame interval
-80b54e1 allow search with token buffer loop and fix PARTITION0 problem
-b7d4e04 add VP8EstimateTokenSize()
-10fddf5 enc/quant.c: silence a warning
-399cd45 Merge "fix compile error on ARM/gcc"
-9f24519 encoder: misc rate-related fixes
-c663bb2 Merge "simplify VP8IteratorSaveBoundary() arg passing"
-fa46b31 Demux.h: Correct a method name reference
-f8398c9 fix compile error on ARM/gcc
-f691f0e simplify VP8IteratorSaveBoundary() arg passing
-42542be up to 6% faster encoding with clang compiler
-93402f0 multi-threaded segment analysis
-7e2d659 Merge "remove the PACK() bit-packing tricks"
-c13fecf remove the PACK() bit-packing tricks
-2fd091c Merge "use NULL for lf_stats_ testing, not bool"
-b11c9d6 dwebp: use default dct_method
-4bb8465 Merge "(de)mux.h: wrap pseudo-code in /* */"
-cfb56b1 make -pass option work with token buffers
-5416aab (de)mux.h: wrap pseudo-code in /* */
-35dba33 use NULL for lf_stats_ testing, not bool
-733a7fa enc->Iterator memory cleanup
-e81fac8 Add support for "no blend" in webpmux binary
-3b80bc4 gif2webp: Separate out each step into a method
-bef7e9c Add doc precision about demux object keeping pointers to data.
-61405a1 dwebp: enable stdout output with WIC
-6eabb88 Merge "Animated WebP: add "do no blend" option to spec"
-be20dec fix compilation for BITS 24
-e58cc13 Merge "dwebp: s/unsigned char/uint8_t/"
-72501d4 dwebp: s/unsigned char/uint8_t/
-2c9633e Merge "gif2webp: Insert independent frames at regular intervals."
-f0d6a14 gif2webp: Insert independent frames at regular intervals.
-b25a6fb yuv.h: fix indent
-ede3602 Merge "cosmetics: fix indent"
-3a65122 dwebp: fix stdout related output
-388a724 cosmetics: fix indent
-4c7322c Merge "dsp: msvc compatibility"
-d50c7e3 Merge "5-7% faster SSE2 versions of YUV->RGB conversion functions"
-b8ab784 Merge "simplify upsampler calls: only allow 'bottom' to be NULL"
-df6cebf 5-7% faster SSE2 versions of YUV->RGB conversion functions
-ad6ac32 simplify upsampler calls: only allow 'bottom' to be NULL
-a5e8afa output to stdout if file name is "-"
-f358450 dsp: msvc compatibility
-43a7c8e Merge "cosmetics"
-4c5f19c Merge "bit_reader.h: cosmetics"
-f72fab7 cosmetics
-14dd5e7 fix const-ness
-b20aec4 Merge "Support for 'do not blend' option in vwebp"
-dcf6522 Support for 'do not blend' option in vwebp
-d5bad03 Animated WebP: add "do no blend" option to spec
-a2f5f73 Merge "Support for "Do not blend" in mux and demux libraries"
-e081f2f Pack code & extra_bits to Struct (VP8LPrefixCode).
-6284854 Support for "Do not blend" in mux and demux libraries
-f486aaa Merge "slightly faster ParseIntraMode"
-d171863 slightly faster ParseIntraMode
-3ceca8a bit_reader.h: cosmetics
-69257f7 Create LUT for PrefixEncode.
-988b708 add WebPWorkerExecute() for convenient bypass
-06e2498 Merge "VP8EncIterator clean-up"
-de4d4ad VP8EncIterator clean-up
-7bbe952 Merge "cosmetics: thread.c: drop a redundant comment"
-da41148 cosmetics: thread.c: drop a redundant comment
-feb4b6e thread.h: #ifdef when checking WEBP_USE_THREAD
-8924a3a thread.c: drop WebPWorker prefix from static funcs
-1aed8f2 Merge "fix indent"
-4038ed1 fix indent
-1693fd9 Demux: A new state WEBP_DEMUX_PARSE_ERROR
-8dcae8b fix rescaling-with-alpha inaccuracy
-11249ab Merge changes I9b4dc36c,I4e0eef4d
-52508a1 Mux: support parsing unknown chunks within a frame/fragment.
-05db057 WebPMuxSetChunk: remove unused variable
-8ba1bf6 Stricter check for presence of alpha when writing lossless images
-a03c351 Demux: WebPIterator now also denotes if the frame has alpha.
-6df743a Decoder: handle fragments case correctly too.
-faa4b07 Support for unknown chunks in mux library
-7d60bbc Speed up HashChainFindCopy function.
-6674014 Speedup Alpha plane encoding.
-b7346a1 0.1 % speedup to decoding
-c606182 webp-container-spec: Tighten language added by last
-a34a502 pngdec: output error messages from libpng
-e84c625 Merge "Detect canvas and image size mismatch in decoder."
-f626fe2 Detect canvas and image size mismatch in decoder.
-f5fbdee demux: stricter image bounds check
-30c8158 add extra assert in Huffman decode code
-8967b9f SSE2 for lossless decoding (critical) functions.
-699d80e Jump-lookup for Huffman coding
-c34307a fix some VS9 warnings about type conversion
-eeada35 pngdec: add missing include
-54b6510 gif2webp: If aligning to even offsets, extra pixels should be transparent
-0bcf5ce Merge "remove a malloc() in case we're using only FILTER_NONE for alpha"
-2c07143 remove a malloc() in case we're using only FILTER_NONE for alpha
-a4d5f59 Faster lossless decoding
-fd53bb7 Merge "alternate LUT-base reverse-bits code"
-d1c166e Merge "Container spec: a clarification on background color."
-fdb9177 Rename a method
-5e96753 Container spec: a clarification on background color.
-30e77d0 Merge branch '0.3.0'
-1b631e2 alternate LUT-base reverse-bits code
-24cc307 ~20% faster lossless decoding
-313d853 Speedup for decoding lossless WebP photographs:
-24ee098 change the bytes_per_pixels_ field into more evocative use_8b_decode
-2a04b03 update ChangeLog (tag: v0.3.1-rc2, tag: v0.3.1)
-7288950 Regression fix for alpha channels using color cache:
-2e377b5 wicdec: silence a format warning
-ad9e42a muxedit: silence some uninitialized warnings
-3307c16 Don't set alpha-channel to 0xff for alpha->green uplift
-5130770 Merge "wicdec: silence a format warning"
-a37eff4 Regression fix for alpha channels using color cache:
-241cf99 Merge "muxedit: silence some uninitialized warnings"
-c8f9c84 Regression fix for alpha unfiltering:
-14cd5c6 muxedit: silence some uninitialized warnings
-a368db8 dec/vp8l: quiet vs9 x64 type conversion warning
-ffae9f3 wicdec: silence a format warning
-8cf0701 Alpha encoding: never filter in case of NO_COMPRESSION
-825e73b update ChangeLog (tag: v0.3.1-rc1)
-abf6f69 update NEWS
-5a92c1a bump version to 0.3.1
-86daf77 store top Y/U/V samples in packed fashion
-67bc353 Revert "add WebPBlendAlpha() function to blend colors against background"
-068db59 Intertwined decoding of alpha and RGB
-38cc011 Simplify forward-WHT + SSE2 version
-3fa595a Support decoding upto given row in DECODE_DATA_FUNC
-520f005 DequantizeLevels(): Add 'row' and 'num_rows' args
-47374b8 Alpha unfilter for given set of rows
-f32097e probe input file and quick-check for WebP format.
-a2aed1d configure: improve gl/glut library test
-c7e89cb update copyright text
-a00380d configure: remove use of AS_VAR_APPEND
-a94a88d fix EXIF parsing in PNG
-a71e5d8 add doc precision for WebPPictureCopy() and WebPPictureView()
-8287012 remove datatype qualifier for vmnv
-e190843 fix a memory leak in gif2webp
-0b18b9e fix two minor memory leaks in webpmux
-db5095d remove some cruft from swig/libwebp.jar
-850e956 README: update swig notes
-bddd9b0 swig/python: add minimal documentation
-d573a8d swig: add python encode support
-6b93187 swig/java: reduce wrapper function code duplication
-6fe536f swig/java: rework uint8_t typemap
-a2ea464 Fix the bug in ApplyPalette.
-7bb28d2 webp/lossless: fix big endian BGRA output
-f036d4b Speed up ApplyPalette for ARGB pixels.
-8112c8c remove some warnings:
-cc128e0 Further reduce memory to decode lossy+alpha images
-07db70d fix for big-endian
-eda8a7d gif2webp: Fix signed/unsigned comparison mismatch
-31f346f Makefile.vc: fix libwebpdemux dll variable typo
-6c76d28 swig: add python (decode) support
-b4f5bb6 swig: cosmetics
-498d4dd WebP-Lossless encoding improvements.
-26e7244 swig: ifdef some Java specific code
-8ecec68 configure: add warning related flags
-e676b04 configure: add GLUT detection; build vwebp
-b0ffc43 Alpha decoding: significantly reduce memory usage
-20aa7a8 configure: add --enable-everything
-b8307cc configure.ac: add some helper macros
-980e7ae Remove the gcc compilation comments
-7f25ff9 gif2webp: Fix ICC and XMP support
-d8e5321 Add missing name to AUTHORS
-11edf5e Demux: Fix a potential memleak
-c7b9218 don't forward declare enums
-7a650c6 prevent signed int overflow in left shift ops
-31bea32 add precision about dynamic output reallocation with IDecoder
-c22877f Add incremental support for extended format files
-5051245 Makefile.vc: have 'all' target build everything
-8191dec Makefile.vc: flags cleanup
-b9d7473 Makefile.vc: drop /FD flag
-5568dbc update gitignore
-f4c7b65 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
-1fb04be pngdec: Avoid a double-free.
-dcbb1ca add WebPBlendAlpha() function to blend colors against background
-bc9f5fb configure.ac: add AM_PROG_AR for automake >= 1.12
-bf867bf Tuned cross_color parameter (step) for lower qual
-90e2ec5 Merge "probe input file and quick-check for WebP format."
-7180d7f Merge "update copyright text"
-830f72b probe input file and quick-check for WebP format.
-2ccf58d configure: improve gl/glut library test
-d640614 update copyright text
-c2113ad Merge "configure: remove use of AS_VAR_APPEND"
-9326a56 configure: remove use of AS_VAR_APPEND
-ea63d61 fix a type warning on VS9 x86
-bec1109 fix EXIF parsing in PNG
-b6e65f3 Merge "fix warnings for vs9 x64"
-438946d fix warnings for vs9 x64
-f4710e3 collect macroblock reconstruction data in VP8MBData struct
-23d28e2 add doc precision for WebPPictureCopy() and WebPPictureView()
-518f2cd cosmetics: gif2webp: fix indent
-af358e6 Merge "remove datatype qualifier for vmnv"
-3fe9163 remove datatype qualifier for vmnv
-764fdff fix a memory leak in gif2webp
-3e59a74 fix two minor memory leaks in webpmux
-47b9862 Merge "README: update swig notes"
-325d15f remove some cruft from swig/libwebp.jar
-4a7627c README: update swig notes
-5da81e3 Merge "swig/python: add minimal documentation"
-f39e08f Merge "swig: add python encode support"
-6ca4a3e Merge "swig/java: reduce wrapper function code duplication"
-8f8702b Merge "swig/java: rework uint8_t typemap"
-91413be reduce memory for VP8MB and remove bitfields use
-7413394 Fix the memory leak in ApplyFilters.
-2053c2c simplify the alpha-filter testing loop
-825b64d swig/python: add minimal documentation
-14677e1 swig: add python encode support
-a5c297c swig/java: reduce wrapper function code duplication
-ad4a367 swig/java: rework uint8_t typemap
-0d25876 use uint8_t for inv_palette[]
-afa3450 Fix the bug in ApplyPalette.
-2d6ac42 Merge "webp/lossless: fix big endian BGRA output"
-2ca8396 webp/lossless: fix big endian BGRA output
-742110c Speed up ApplyPalette for ARGB pixels.
-2451e47 misc code cleanup
-83db404 Merge "swig: add python (decode) support"
-eeeea8b Merge "swig: cosmetics"
-d5f9b8f Merge "libwebp: fix vp8 encoder mem alloc offsetting"
-d8edd83 libwebp: fix vp8 encoder mem alloc offsetting
-8983b83 remove use of bit-fields in VP8FInfo
-87a4fca remove some warnings:
-ba8f74e Merge "fix for big-endian"
-a65067f Merge "Further reduce memory to decode lossy+alpha images"
-64c8448 Further reduce memory to decode lossy+alpha images
-332130b Mux: make a few methods static
-4437061 fix for big-endian
-5199eab Merge "add uncompressed TIFF output support"
-a3aede9 add uncompressed TIFF output support
-f975b67 Merge "gif2webp: Fix signed/unsigned comparison mismatch"
-5fbc734 Merge "GetFeatures: Detect invalid VP8X/VP8/VP8L data"
-d5060c8 Merge "mux.h: A comment fix + some consistency fixes"
-352d0de GetFeatures: Detect invalid VP8X/VP8/VP8L data
-3ef79fe Cosmetic: "width * height"
-043e1ae gif2webp: Fix signed/unsigned comparison mismatch
-5818cff mux.h: A comment fix + some consistency fixes
-1153f88 Merge "swig: ifdef some Java specific code"
-3eeedae Makefile.vc: fix libwebpdemux dll variable typo
-f980faf swig: add python (decode) support
-7f5f42b swig: cosmetics
-8eae188 WebP-Lossless encoding improvements.
-c7247c4 swig: ifdef some Java specific code
-4cb234d Merge "Mux: make ValidateForSingleImage() method static"
-ed6f530 Merge "Add GetCanvasSize() method to mux"
-1d530c9 Mux: make ValidateForSingleImage() method static
-bba4c2b configure: add warning related flags
-fffefd1 Add GetCanvasSize() method to mux
-732da8d Merge "configure: add GLUT detection; build vwebp"
-0e513f7 configure: add GLUT detection; build vwebp
-55d1c15 Merge "Alpha decoding: significantly reduce memory usage"
-13d99fb Merge "configure: add --enable-everything"
-2bf698f Merge "configure.ac: add some helper macros"
-edccd19 Alpha decoding: significantly reduce memory usage
-3cafcc9 configure: add --enable-everything
-4ef1447 configure.ac: add some helper macros
-a4e1cdb Remove the gcc compilation comments
-6393fe4 Cosmetic fixes
-9c4ce97 Simplify forward-WHT + SSE2 version
-878b9da fix missed optim
-0004617 VP8GetInfo(): Check for zero width or height.
-9bf3129 align VP8Encoder::nz_ allocation
-5da165c fix CheckMode() signature
-0ece07d Merge "explicitly pad bitfields to 32-bits"
-9dbc9d1 explicitly pad bitfields to 32-bits
-5369a80 Merge "prevent signed int overflow in left shift ops"
-70e3971 Merge "cosmetics: remove unnecessary ';'s"
-d3136ce Merge "don't forward declare enums"
-b26e5ad gif2webp: Fix ICC and XMP support
-46089b2 Add missing name to AUTHORS
-94328d6 Demux: Fix a potential memleak
-96e948d don't forward declare enums
-f4f9088 prevent signed int overflow in left shift ops
-0261545 cosmetics: remove unnecessary ';'s
-7ebdf11 Merge "Fix few missing comparisons to NULL"
-1579989 Fix few missing comparisons to NULL
-ea1b21c Cleaned up VP8GetHeaders() so that it parses only frame header
-b66caee dwebp: add support for BMP output
-ff885bf add precision about dynamic output reallocation with IDecoder
-79241d5 Merge "Makefile.vc: have 'all' target build everything"
-ac1c729 Merge "Makefile.vc: flags cleanup"
-118a055 Merge "Makefile.vc: drop /FD flag"
-ecad010 Merge "update gitignore"
-a681b4f Rename PRE_VP8 state to WEBP_HEADER
-ead4d47 Add incremental support for extended format files
-69d0f92 Makefile.vc: have 'all' target build everything
-5296749 Makefile.vc: flags cleanup
-c61baf0 Makefile.vc: drop /FD flag
-3a15125 update gitignore
-5167ca4 Merge "WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded."
-67708d6 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
-b68912a pngdec: Avoid a double-free.
-82abbe1 Merge "configure.ac: add AM_PROG_AR for automake >= 1.12"
-e7d9548 add WebPBlendAlpha() function to blend colors against background
-ed4dc71 configure.ac: add AM_PROG_AR for automake >= 1.12
-df4a406 Merge branch '0.3.0'
-1e0d4b8 Update ChangeLog (tag: v0.3.0-rc7, tag: v0.3.0)
-d52b405 Cosmetic fixes
-6cb4a61 misc style fix
-68111ab add missing YUVA->ARGB automatic conversion in WebPEncode()
-e9a7990 Cosmetic fixes
-403bfe8 Container spec: Clarify frame disposal
-2aaa423 Merge "add missing YUVA->ARGB automatic conversion in WebPEncode()"
-07d87bd add missing YUVA->ARGB automatic conversion in WebPEncode()
-142c462 misc style fix
-3e7a13a Merge "Container spec: clarify the background color field" into 0.3.0
-14af774 container doc: add a note about the 'ANMF' payload
-cc635ef Container spec: clarify the background color field
-e3e3394 container doc: move RIFF description to own section
-4299f39 libwebp/mux: fix double free
-33f9a69 Merge "demux: keep a frame tail pointer; used in AddFrame" into 0.3.0
-a2a7b95 use WebPDataCopy() instead of re-coding it.
-6f18f12 demux: keep a frame tail pointer; used in AddFrame
-e5af49e add doc precision about WebPParseHeaders() return codes
-db46daa Merge "Makefile.vc: fix dynamic builds" into 0.3.0
-53c77af Merge "gif2webp: Bgcolor fix for a special case" into 0.3.0
-a5ebd14 gif2webp: Bgcolor fix for a special case
-6378f23 Merge "vwebp/animation: fix background dispose" into 0.3.0
-3c8eb9a fix bad saturation order in QuantizeBlock
-04c7a2e vwebp/animation: fix background dispose
-81a5069 Makefile.vc: fix dynamic builds
-5f25c39 update ChangeLog (tag: v0.3.0-rc6)
-14d42af examples: don't use C99 %zu
-5ccf1fe update ChangeLog
-2560c24 update NEWS
-f43bafc Merge changes Iecccb09c,If5ee9fd2,I3e181ce4 into 0.3.0
-a788644 dwebp: warn when decoding animated webp's
-302efcd Decode: return more meaningful error for animation
-ad45273 WebPBitstreamFeatures: add has_animation field
-783dfa4 disable FRGM decoding for good in libwebpmux
-4b956be Update ChangeLog
-ad8b86d update NEWS
-3e084f6 Merge "demux cosmetics: comments/rename internal function" into 0.3.0
-d3f8c62 Merge "move WebPFeatureFlags declaration" into 0.3.0
-7386fe5 Merge "libwebp{demux,mux}: install mux_types.h" into 0.3.0
-d6cd4e9 Merge "bump decode abi" into 0.3.0
-17f8da5 bump decode abi
-97684ae Merge "add doc precision about WebPDemuxPartial()" into 0.3.0
-f933fd2 move WebPFeatureFlags declaration
-289bc47 libwebp{demux,mux}: install mux_types.h
-224e8d4 add doc precision about WebPDemuxPartial()
-4c18e80 demux cosmetics: comments/rename internal function
-7cfd1bf update AUTHORS
-401f7b8 Merge "speed-up lossless (~3%) with ad-hoc histogram cost evaluation" into 0.3.0
-1fc8ffc Merge "makefile.unix: dist related changes" into 0.3.0
-8a89c6e Merge changes I466c377f,Ib761ebd3,I694857fc into 0.3.0
-f4ffb2d speed-up lossless (~3%) with ad-hoc histogram cost evaluation
-723847d gif2webp: only write error messages to stderr
-701b9e2 makefile.unix: dist related changes
-bb85b43 Merge "update NEWS" into 0.3.0
-59423a2 gif2webp: fix crash on open failure with libgif5
-9acb17d gif2webp: silence a unused param warning
-7d9fdc2 Merge "README updates" into 0.3.0
-5621934 Merge "build: fix install race on shared headers" into 0.3.0
-70809d8 Merge "bump version to 0.3.0" into 0.3.0
-d851cd1 demux: make the parse a bit more strict
-28bb410 update NEWS
-cef9388 bump version to 0.3.0
-9048494 build: fix install race on shared headers
-1e67e8e README updates
-42b611a Merge "configure: drop experimental from mux/demux" into 0.3.0
-096a8e3 Merge "vwebp: add color profile support" into 0.3.0
-ddfee5d vwebp: add color profile support
-0d6927d Merge "Mark fragment options as experimental in webpmux" into 0.3.0
-5dbd403 Mark fragment options as experimental in webpmux
-a0a6648 configure: drop experimental from mux/demux
-ee65bad Merge "add support for BITS > 32" into 0.3.0
-744930d add support for BITS > 32
-7dd288f cwebp: fix build
-19a8dd0 Merge "Makefile.vc: add vwebp.exe target" into 0.3.0
-50eedda Merge "examples: normalize icc related program arguments" into 0.3.0
-757f637 Merge "Makefile.vc: add libwebpdecoder target" into 0.3.0
-b65c4b7 Makefile.vc: add libwebpdecoder target
-f8db7b4 Merge "vwebp: replace doubles w/floats where appropriate" into 0.3.0
-d99aa56 Makefile.vc: add vwebp.exe target
-013023e vwebp: replace doubles w/floats where appropriate
-9b3db89 README.mux: add version reference
-7b6a26c Merge "cwebp: output metadata statistics" into 0.3.0
-d8dc72a examples: normalize icc related program arguments
-7bfc905 Merge "make alpha unfilter work in-place" into 0.3.0
-0037b2d Merge "add LUT-free reference code for YUV->RGB conversion." into 0.3.0
-166bf74 Merge "demux: disable fragment parsing" into 0.3.0
-126974b add LUT-free reference code for YUV->RGB conversion.
-0aef3eb make alpha unfilter work in-place
-14ef500 Merge "Remove 'status: experimental' from container spec" into 0.3.0
-d40c98e Merge "webpmux binary: tiny style fix" into 0.3.0
-0bc4268 cwebp: output metadata statistics
-bc03980 Merge "autoconf: normalize experimental define" into 0.3.0
-d1e21b1 Remove 'status: experimental' from container spec
-7681bb9 webpmux binary: tiny style fix
-a3dd3d0 avoid installing example_util.h
-252320e demux: disable fragment parsing
-537bde0 autoconf: normalize experimental define
-5e338e0 Merge changes I33e8a613,I8e8a7b44 into 0.3.0
-d9d0ea1 Merge changes If21e3ec7,I991fc30b into 0.3.0
-627f5ca automake: add reference to libwebp for mux/demux
-eef73d0 don't consolidate proba stats too often
-05ec4cc libwebp{,decoder}.pc: add pthread flags
-1bfcf5b add libwebpmux.pc
-26ca843 add libwebpdemux.pc
-69e2590 Merge "Tune Lossless compression for lower qualities."
-0478b5d Tune Lossless compression for lower qualities.
-39f7586 add a mention of parallel alpha encoding in the NEWS
-5a21d96 Merge "1.5x-2x faster encoding for method 3 and up"
-9bfbdd1 1.5x-2x faster encoding for method 3 and up
-27dc741 Correct frame options order in README.mux
-be2fd17 Mux: fix a scenario with bad ANMF/FRGM size
-19eb012 Merge "Demux: Add option to get frame count using GetI()"
-7368b8c Merge "WebPGetFeatures() out of if condition for clarity."
-f604c9a Merge "fix windows build"
-153f94e fix windows build
-847b492 Merge "vwebp: use magenta for 'i'nfo display"
-25ea46b Merge "vwebp: add keyboard shortcuts to help output"
-bea7cca vwebp: use magenta for 'i'nfo display
-8fab161 webpmux: correct -frame param order in help output
-03cc23d vwebp: add keyboard shortcuts to help output
-068eba8 Demux: Add option to get frame count using GetI()
-988b8f5 WebPGetFeatures() out of if condition for clarity.
-6933d91 Merge "gif2webp: Be lenient about background color index."
-4d0f7c5 Merge "WebPGetFeatures() behavior change:"
-fdeeb01 gif2webp: Be lenient about background color index.
-ad25032 Merge "multi-threaded alpha encoding for lossy"
-4e32d3e Merge "fix compilation of token.c"
-f817930 multi-threaded alpha encoding for lossy
-8805035 fix compilation of token.c
-fc81621 code using the actual values for num_parts_, not the ones from config
-7265535 Merge "move the config check from .c to .h"
-dd9e76f move the config check from .c to .h
-956b217 WebPGetFeatures() behavior change:
-df02e4c WebPDemuxGetI behavior change:
-633c004 Merge "rebalance method tools (-m) for methods [0..4]"
-58ca6f6 rebalance method tools (-m) for methods [0..4]
-7648c3c Merge "describe rd-opt levels introduce VP8RDLevel enum"
-67fb100 Merge "autoconf: enable silent-rules by default"
-a5042a3 GetVersion() methods for mux and demux
-5189957 describe rd-opt levels introduce VP8RDLevel enum
-4e094ac autoconf: enable silent-rules by default
-b7eaa85 inline VP8LFastLog2() and VP8LFastSLog2 for small values
-5cf7792 split quant_levels.c into decoder and encoder version
-e5d3ffe Merge "Update code example in README.mux"
-ac5a915 Update code example in README.mux
-38a91e9 Add example code snippet for demux API
-5f557f3 README.mux: add info about Demux API and vwebp
-c0ba090 backward_references: avoid signed integer overflow
-943386d disable SSE2 for now
-9479fb7 lossless encoding speedup
-ec2030a merge two lines together
-b67956c Merge "Remove ReadOneBit() and ReadSymbolUnsafe()"
-1667bde Remove ReadOneBit() and ReadSymbolUnsafe()
-3151669 wicdec + dwebp cosmetics: normalize formatting
-92668da change default filtering parameters:   * type is now 'strong'   * strength is now '60'
-b7490f8 introduce WEBP_REFERENCE_IMPLEMENTATION compile option
-3383885 faster decoding (3%-6%)
-5c3e381 Merge "add a -jpeg_like option"
-c231104 remove unused declaration of VP8Zigzag
-3615295 Merge "wicdec: add alpha support for paletted formats"
-c9f1649 wicdec: add alpha support for paletted formats
-1262f81 Merge "wicdec: silence some warnings"
-e7ea61e wicdec: silence some warnings
-23c0f35 fix missing intptr_t->int cast for MSVC
-e895059 add a -jpeg_like option
-1f803f6 Merge "Tune alpha quality mapping to more reasonable values."
-1267d49 Tune alpha quality mapping to more reasonable values.
-043076e Merge "speed-up lossless in BackwardTrace"
-f3a44dc remove one malloc from TraceBackwards()
-0fc1a3a speed-up lossless in BackwardTrace
-7c732e5 cwebp: centralize WebPCleanupTransparentArea()
-7381254 Merge "wicdec: add ICC profile extraction"
-e83ff7d wicdec: add ICC profile extraction
-146c6e3 Merge "cosmetics: pngdec: normalize default label location"
-a8f549d Merge "manpages: italicize option parameters"
-e118db8 Merge "encode.h: note the need to free() WebPMemoryWriter"
-1dfee6d cosmetics: pngdec: normalize default label location
-14c3820 manpages: italicize option parameters
-7defbfa encode.h: note the need to free() WebPMemoryWriter
-88d382a cwebp: cleanup after memory_writer
-12d6cec fix extra space in dwebp.1 man
-b01681a Fix for demuxer frame iteration:
-56c12aa Demuxer creation fix:
-66c810b add a -yuv option to dwebp (very similar to -pgm)
-841a3ba Merge "Remove -Wshadow warnings."
-8fd0252 Merge "upsampling_neon.c: fix build"
-6efed26 Remove -Wshadow warnings.
-60904aa Merge "allow WebPINewRGB/YUVA to be passed a NULL output buffer."
-b7adf37 allow WebPINewRGB/YUVA to be passed a NULL output buffer.
-27f8f74 upsampling_neon.c: fix build
-06b9cdf gitignore: add IOS related directories
-f112221 Merge "Fix more comments for iobuild.sh"
-fe4d25d Fix more comments for iobuild.sh
-1de3e25 Merge "NEON optimised yuv to rgb conversion"
-090b708 NEON optimised yuv to rgb conversion
-daa0647 Merge "Add ios build script for building iOS library."
-79fe39e Add ios build script for building iOS library.
-126c035 remove some more -Wshadow warnings
-522e9d6 Merge "cwebp: enable '-metadata'"
-76ec5fa cwebp: enable '-metadata'
-aeb91a9 Merge "cosmetics: break a few long lines"
-be7c96b cosmetics: break a few long lines
-cff8ddb Merge "add libwebpdecoder.pc"
-93148ab Merge "libwebp.pc.in: detab"
-6477f95 Merge "Makefile.vc: normalize path separator"
-bed1ed7 add libwebpdecoder.pc
-46168b2 libwebp.pc.in: detab
-a941a34 Fixed few nits in the build files.
-dd7a49b Makefile.vc: normalize path separator
-9161be8 Merge "cwebp: extract WIC decoding to its own module"
-08e7c58 Merge "Provide an option to build decoder library."
-0aeba52 Provide an option to build decoder library.
-757ebcb catch malloc(0)/calloc(0) with an assert
-152ec3d Merge "handle malloc(0) and calloc(0) uniformly on all platforms"
-a452a55 cwebp: extract WIC decoding to its own module
-2b252a5 Merge "Provide option to swap bytes for 16 bit colormodes"
-94a48b4 Provide option to swap bytes for 16 bit colormodes
-42f8f93 handle malloc(0) and calloc(0) uniformly on all platforms
-8b2152c Merge "add an extra assert to check memory bounds"
-0d19fbf remove some -Wshadow warnings
-cd22f65 add an extra assert to check memory bounds
-8189fed Merge "Add details and reference about the YUV->RGB conversion"
-1d2702b Merge "Formatting fixes in lossless bitstream spec"
-8425aae Formatting fixes in lossless bitstream spec
-a556cb1 Add details and reference about the YUV->RGB conversion
-d8f21e0 add link to SSIM description on Wikipedia
-18e9167 Merge "WebP-lossless spec clarifications:"
-98e25b9 Merge "cwebp: add -metadata option"
-f01c2a5 WebP-lossless spec clarifications:
-f4a9797 Merge "Disto4x4 and Disto16x16 in NEON"
-47b7b0b Disto4x4 and Disto16x16 in NEON
-7eaee9f cwebp: add -metadata option
-36c52c2 tiffdec: use toff_t for exif ifd offset
-7c8111e Merge "cwebp/tiffdec: add TIFF metadata extraction"
-e6409ad Remove redundant include from dsp/lossless code.
-1ab5b3a Merge "configure: fix --with-gifincludedir"
-03c749e configure: fix --with-gifincludedir
-8b65063 multiple libgif versions support for gif2webp
-476e293 gif2webp: Use DGifOpenFileName()
-b50f277 tiffdec: correct format string
-2b9048e Merge "tiffdec: check error returns for width/height"
-a1b5a9a Merge "cwebp/tiff: use the first image directory"
-079423f tiffdec: check error returns for width/height
-d62824a Merge "cwebp/jpegdec: add JPEG metadata extraction"
-03afaca Merge "cwebp: add PNG metadata extraction"
-2c72496 cwebp/jpegdec: add JPEG metadata extraction
-dba64d9 cwebp: add PNG metadata extraction
-1f075f8 Lossless spec corrections/rewording/clarifications
-2914ecf cwebp/tiffdec: add TIFF metadata extraction
-d82a3e3 More corrections/clarifications in lossless spec:
-bd00255 cwebp/tiff: use the first image directory
-df7aa07 Merge "Cleanup around jpegdec"
-0f57dcc decoding speed-up (~1%)
-bcec339 Lossless bitstream clarification:
-6bf2087 add examples/metadata.c
-207f89c Merge "configure: add libwebpdemux status to summary"
-1bd287a Cleanup around jpegdec
-9145567 Merge "cosmetics: use '== 0' in size checks"
-d6b88b7 cosmetics: use '== 0' in size checks
-d3dace2 cosmetics: jpegdec
-2f69af7 configure: add libwebpdemux status to summary
-1c1c564 cwebp: extract tiff decoding to its own module
-6a871d6 cwebp: extract jpeg decoding to its own module
-2ee228f cwebp: extract png decoding to its own module
-4679db0 Merge "cwebp: add metadata framework"
-63aba3a cwebp: add metadata framework
-931bd51 lossless bitstream: block size bits correction
-e4fc4c1 lossless bitstream: block size bits correction
-d65ec67 fix build, move token.c to src/enc/
-657f5c9 move token buffer to its own file (token.c)
-c34a375 introduce GetLargeValue() to slim-fast GetCoeffs().
-d5838cd faster non-transposing SSE2 4x4 FTransform
-f76191f speed up GetResidualCost()
-ba2aa0f Add support for BITS=24 case
-2e7f6e8 makefile.unix: Dependency on libraries
-dca8421 Merge "Separate out mux and demux code and libraries:"
-23782f9 Separate out mux and demux code and libraries:
-bd56a01 configure: add summary output
-90e5e31 dwebp manual: point to webpmux, gif2webp.
-540790c gif2webp.c: add a note about prerequisites
-d1edf69 cwebp man page: meaning of '-q' for lossy/lossless
-79efa1d Add man page for gif2webp utility
-2243e40 Merge "gif2webp build support with autoconf tools"
-c40efca gif2webp build support with autoconf tools
-6523e2d WebP Container:
-4da788d Merge "simplify the fwd transform"
-42c3b55 simplify the fwd transform
-41a6ced user GLfloat instead of float
-b542611 fix indentation
-68f282f * handle offset in anim viewer 'vwebp' * fix gif2webp to handle disposal method and odd offset correctly
-118cb31 Merge "add SSE2 version of Sum of Square error for 16x16, 16x8 and 8x8 case"
-8a7c3cc Merge "Change the order of -frame argument to be more natural"
-99e0a70 Merge "Simplify the texture evaluation Disto4x4()"
-0f923c3 make the bundling work in a tmp buffer
-e5c3b3f Simplify the texture evaluation Disto4x4()
-4860008 Change the order of -frame argument to be more natural
-35bfd4c add SSE2 version of Sum of Square error for 16x16, 16x8 and 8x8 case
-a7305c2 Clarification for unknown chunks
-4c4398e Refine WebP Container Spec wrt unknown chunks.
-2ca642e Rectify WebPMuxGetFeatures:
-7caab1d Some cosmetic/comment fixes.
-60b2651 Merge "Write a GIF to WebP converter based on libgif."
-c7127a4 Merge "Add NEON version of FTransformWHT"
-11b2721 Write a GIF to WebP converter based on libgif.
-e9a15a3 ExUtilWriteFile() to write memory segment to file
-74356eb Add a simple cleanup step in mux assembly:
-51bb1e5 mux.h: correct WebPDemuxSelectFragment() prototype
-22a0fd9 Add NEON version of FTransformWHT
-fa30c86 Update mux code to match the spec wrt animation
-d9c5fbe by-pass Analysis pass in case segments=1
-d2ad445 Merge changes Ibeccffc3,Id1585b16
-5c8be25 Merge "Chunk fourCCs for XMP/EXIF"
-a00a3da Use 'frgm' instead of 'tile' in webpmux parameters
-81b8a74 Design change in ANMF and FRGM chunks:
-f903cba Chunk fourCCs for XMP/EXIF
-812933d Tune performance of HistogramCombine
-52ad197 Animation specification in container spec
-001b930 Image fragment specification in container spec
-391f9db Ordering of description of bits in container spec
-d573577 Metadata specification in container spec
-1c4609b Merge commit 'v0.2.1'
-0ca584c Merge "Color profile specification in container spec"
-e8b41ad add NEON asm version for WHT inverse transform
-af6f0db Color profile specification in container spec
-a61a824 Merge "Add NULL check in chunk APIs"
-0e8b7ee fix WebPPictureView() unassigned strides
-75e5f17 ARM/NEON: 30% encoding speed-up
-02b4356 Add NULL check in chunk APIs
-a077072 mux struct naming
-6c66dde Merge "Tune Lossless encoder"
-ab5ea21 Tune Lossless encoder
-74fefc8 Update ChangeLog (tag: v0.2.1, origin/0.2.0, 0.2.0)
-92f8059 Rename some chunks:
-3bb4bbe Merge "Mux API change:"
-d0c79f0 Mux API change:
-abc0604 Merge "update NEWS" into 0.2.0
-57cf313 update NEWS
-25f585c bump version to 0.2.1
-fed7c04 libwebp: validate chunk size in ParseOptionalChunks
-552cd9b cwebp (windows): fix alpha image import on XP
-b14fea9 autoconf/libwebp: enable dll builds for mingw
-4a8fb27 [cd]webp: always output windows errors
-d662158 fix double to float conversion warning
-72b96a6 cwebp: fix jpg encodes on XP
-734f762 VP8LAllocateHistogramSet: fix overflow in size calculation
-f9cb58f GetHistoBits: fix integer overflow
-b30add2 EncodeImageInternal: fix uninitialized free
-3de58d7 fix the -g/O3 discrepancy for 32bit compile
-77aa7d5 fix the BITS=8 case
-e5970bd Make *InitSSE2() functions be empty on non-SSE2 platform
-ef5cc47 make *InitSSE2() functions be empty on non-SSE2 platform
-c4ea259 make VP8DspInitNEON() public
-8344ead Merge "libwebp: validate chunk size in ParseOptionalChunks"
-4828bb9 Merge "cwebp (windows): fix alpha image import on XP"
-3076333 libwebp: validate chunk size in ParseOptionalChunks
-7048189 AccumulateLSIM: fix double -> float warnings
-eda8ee4 cwebp (windows): fix alpha image import on XP
-c6e9865 Merge "add EXPERIMENTAL code for YUV-JPEG colorspace"
-f0360b4 add EXPERIMENTAL code for YUV-JPEG colorspace
-f86e6ab add LSIM metric to WebPPictureDistortion()
-c3aa215 Speed up HistogramCombine for lower qualities.
-1765cb1 Merge "autoconf/libwebp: enable dll builds for mingw"
-a13562e autoconf/libwebp: enable dll builds for mingw
-9f469b5 typo: no_fancy -> no_fancy_upsampling
-1a27f2f Merge "fix double to float conversion warning"
-cf1e90d Merge "cwebp: fix jpg encodes on XP"
-f2b5d19 [cd]webp: always output windows errors
-e855208 fix double to float conversion warning
-ecd66f7 cwebp: fix jpg encodes on XP
-7b3eb37 Tune lossless compression to get better gains.
-ce8bff4 Merge "VP8LAllocateHistogramSet: fix overflow in size calculation"
-ab5b67a Merge "EncodeImageInternal: fix uninitialized free"
-7fee5d1 Merge "GetHistoBits: fix integer overflow"
-a6ae04d VP8LAllocateHistogramSet: fix overflow in size calculation
-80237c4 GetHistoBits: fix integer overflow
-8a99723 EncodeImageInternal: fix uninitialized free
-0b9e682 minor cosmetics
-a792b91 fix the -g/O3 discrepancy for 32bit compile
-73ba435 Merge "detect and merge similar segments"
-fee6627 detect and merge similar segments
-0c44f41 src/webp/*.h: don't forward declare enums in C++
-d7a5ac8 vwebp: use demux interface
-931e0ea Merge "replace 'typedef struct {} X;" by "typedef struct X X; struct X {};""
-8f216f7 remove cases of equal comparison for qsort()
-28d25c8 replace 'typedef struct {} X;" by "typedef struct X X; struct X {};"
-2afee60 speed up for ARM using 8bit for boolean decoder
-5725cab new segmentation algorithm
-2cf1f81 Merge "fix the BITS=8 case"
-12f78ae fix the BITS=8 case
-6920c71 fix MSVC warnings regarding implicit uint64 to uint32 conversions
-f6c096a webpmux binary: Rename 'xmp' option to 'meta'
-ddfe871 webpmux help correction
-b7c5544 Merge "Make *InitSSE2() functions be empty on non-SSE2 platform"
-1c04a0d Common APIs for chunks metadata and color profile.
-2a3117a Merge "Create WebPMuxFrameInfo struct for Mux APIs"
-5c3a723 Make *InitSSE2() functions be empty on non-SSE2 platform
-7c6e60f make *InitSSE2() functions be empty on non-SSE2 platform
-c7eb457 make VP8DspInitNEON() public
-ab3234a Create WebPMuxFrameInfo struct for Mux APIs
-e3990fd Alignment fixes
-e55fbd6 Merge branch '0.2.0'
-4238bc0 Update ChangeLog (tag: v0.2.0)
-c655380 dec/io.c: cosmetics
-fe1958f RGBA4444: harmonize lossless/lossy alpha values
-681cb30 fix RGBA4444 output w/fancy upsampling
-f06c1d8 Merge "Alignment fix" into 0.2.0
-f56e98f Alignment fix
-6fe843b avoid rgb-premultiply if there's only trivial alpha values
-528a11a fix the ARGB4444 premultiply arithmetic
-a0a4885 Lossless decoder fix for a special transform order
-62dd9bb Update encoding heuristic w.r.t palette colors.
-6f4272b remove unused ApplyInverseTransform()
-93bf0fa Update ChangeLog (tag: v0.2.0-rc1)
-5934fc5 update AUTHORS
-014a711 update NEWS
-43b0d61 add support for ARGB -> YUVA conversion for lossless decoder
-33705ca bump version to 0.2.0
-c40d7ef fix alpha-plane check + add extra checks
-a06f802 MODE_YUVA: set alpha to opaque if the image has none
-52a87dd Merge "silence one more warning" into 0.2.0
-3b02309 silence one more warning
-f94b04f move some RGB->YUV functions to yuv.h
-4b71ba0 README: sync [cd]webp help output
-c9ae57f man/dwebp.1: add links to output file format details
-292ec5c quiet a few 'uninitialized' warnings
-4af3f6c fix indentation
-9b261bf remove the last NOT_HAVE_LOG2 instances
-323dc4d remove use of log2(). Use VP8LFastLog2() instead.
-8c515d5 Merge "harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc" into 0.2.0
-d4b4bb0 Merge changes I46090628,I1a41b2ce into 0.2.0
-bff34ac harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc
-a3c063c Merge "extra size check for security" into 0.2.0
-5e79630 Merge "WebPEncode: clear stats at the start of encode" into 0.2.0
-f1edf62 Merge "rationalize use of color-cache" into 0.2.0
-c193331 extra size check for security
-906be65 rationalize use of color-cache
-dd1c387 Add image-hint for low-color images.
-4eb7aa6 Merge "WebPCheckMalloc() and WebPCheckCalloc():" into 0.2.0
-80cc730 WebPCheckMalloc() and WebPCheckCalloc():
-183cba8 check VP8LBitWriterInit return
-cbfa9ee lossless: fix crash on user abort
-256afef cwebp: exit immediately on version mismatch
-475d87d WebPEncode: clear stats at the start of encode
-a7cc729 fix type and conversion warnings
-7d853d7 add stats for lossless
-d39177b make QuantizeLevels() store the sum of squared error
-5955cf5 replace x*155/100 by x*101581>>16
-7d732f9 make QuantizeLevels() store the sum of squared error
-e45a446 replace x*155/100 by x*101581>>16
-159b75d cwebp output size consistency:
-cbee59e Merge commit 'v0.1.99'
-1889e9b dwebp: report -alpha option
-3bc3f7c Merge "dwebp: add PAM output support" into 0.2.0
-d919ed0 dwebp: add PAM output support
-85e215d README/manpages/configure: update website link
-c3a207b Update ChangeLog (tag: v0.1.99)
-d1fd782 Merge "add extra precision about default values and behaviour" into 0.2.0
-efc826e add extra precision about default values and behaviour
-9f29635 header/doc clean up
-ff9fd1b Makefile.vc: fix webpmux.exe *-dynamic builds
-8aacc7b remove INAM, ICOP, ... chunks from the test webp file.
-2fc1301 harmonize authors as "Name (mail@address)"
-4a9f37b Merge "update NEWS" into 0.2.0
-7415ae1 makefile.unix: provide examples/webpmux target
-ce82ced update NEWS
-641e28e Merge "man/cwebp.1: wording, change the date" into 0.2.0
-c37c23e README: cosmetics
-3976dcd man/cwebp.1: wording, change the date
-3e5bbe1 Merge "rename 'use_argb_input' to 'use_argb'" into 0.2.0
-ce90847 Merge "add some padding bytes areas for later use" into 0.2.0
-2390dab Merge "fixing the findings by Frederic Kayser to the bitstream spec" into 0.2.0
-0275159 add a very crude progress report for lossless
-a4b9b1c Remove some unused enum values.
-dd10817 rename 'use_argb_input' to 'use_argb'
-90516ae add some padding bytes areas for later use
-d03b250 fixing the findings by Frederic Kayser to the bitstream spec
-ce156af add missing ABI compatibility checks
-9d45416 Merge "Doc: container spec text tweaks" into 0.2.0
-4e2e0a8 Doc: container spec text tweaks
-f7f16a2 add ABI compatibility check
-2a77557 Merge "swig: add WebPEncodeLossless* wrappers" into 0.2.0
-a3ec622 mux.h: remove '* const' from function parameters
-31426eb encode.h: remove '* const' from function parameters
-9838e5d decode.h: remove '* const' from function parameters
-4972302 swig: add WebPEncodeLossless* wrappers
-9ff00ca bump encoder/decoder versions
-c2416c9 add lossless quick encoding functions to the public API
-4c1f5d6 Merge "NEWS: mention decode_vp8.h is no longer installed" into 0.2.0
-6cb2277 NEWS: mention decode_vp8.h is no longer installed
-d5e5ad6 move decode_vp8.h from webp/ to dec/
-8d3b04a Merge "header clean-up" into 0.2.0
-02201c3 Merge "remove one malloc() by making color_cache non dynamic" into 0.2.0
-d708ec1 Merge "move MIN/MAX_HISTO_BITS to format_constants.h" into 0.2.0
-ab2da3e Merge "add a malloc() check" into 0.2.0
-2d571bd add a malloc() check
-7f0c178 remove one malloc() by making color_cache non dynamic
-6569cd7 Merge "VP8LFillBitWindow: use 64-bit path for msvc x64 builds" into 0.2.0
-23d34f3 header clean-up
-2a3ab6f move MIN/MAX_HISTO_BITS to format_constants.h
-985d3da Merge "shuffle variables in HashChainFindCopy" into 0.2.0
-cdf885c shuffle variables in HashChainFindCopy
-c3b014d Android.mk: add missing lossless files
-8c1cc6b makefile.unix dist: explicitly name installed includes
-7f4647e Merge "clarify the colorspace naming and byte ordering of decoded samples" into 0.2.0
-cbf6972 clarify the colorspace naming and byte ordering of decoded samples
-857650c Mux: Add WebPDataInit() and remove WebPImageInfo
-ff771e7 don't install webp/decode_vp8.h
-596dff7 VP8LFillBitWindow: use 64-bit path for msvc x64 builds
-3ca7ce9 Merge "doc: remove non-finalized chunk references" into 0.2.0
-1efaa5a Merge "bump versions" into 0.2.0
-51fa13e Merge "README: update cwebp help output" into 0.2.0
-12f9aed README: update cwebp help output
-f0b5def bump versions
-4c42a61 update AUTHORS
-6431a1c doc: remove non-finalized chunk references
-8130c4c Merge "build: remove libwebpmux from default targets/config"
-23b4443 Merge "configure: broaden test for libpng-config"
-85bff2c Merge "doc: correct lossless prefix coding table & code"
-05108f6 Merge "More spec/code matching in mux:"
-6808e69 More spec/code matching in mux:
-bd2b46f Merge "doc/webp-container-spec: light cosmetics"
-20ead32 doc/webp-container-spec: light cosmetics
-1d40a8b configure: add pthread detection
-b5e9067 fix some int <-> size_t mix for buffer sizes
-e41a759 build: remove libwebpmux from default targets/config
-0fc2baa configure: broaden test for libpng-config
-45b8272 Merge "restore authorship to lossless bitstream doc"
-06ba059 restore authorship to lossless bitstream doc
-44a09a3 add missing description of the alpha filtering methods
-63db87d Merge "vwebp: add checkboard background for alpha display"
-a73b897 vwebp: add checkboard background for alpha display
-939158c Merge "vwebp: fix info display"
-b35c07d vwebp: fix info display
-48b39eb fix underflow for very short bitstreams
-7e62298 cosmetics: param alignment, manpage wording
-1bd7dd5 Merge changes I7b0afb0d,I7ecc9708
-ac69e63 Merge "Updated cwebp man's help for Alpha & Lossless."
-c0e8859 Get rid of image_info_ from WebPChunk struct.
-135ca69 WebP Container Spec:
-eb6f9b8 Updated cwebp man's help for Alpha & Lossless.
-0fa844f cosmetic fixes on assert and 'const' where applicable
-7f22bd2 check limit of width * height is 32 bits
-16c46e8 autoconf/make: cosmetics: break long lines
-ab22a07 configure: add helper macro to define --with-*
-c17699b configure: add libtiff test
-0e09732 Merge "cwebp: fix crash with yuv input + lossless"
-88a510f Merge "fix big-endian VP8LWriteBits"
-da99e3b Merge "Makefile.vc: split mux into separate lib"
-7bda392 cwebp: fix crash with yuv input + lossless
-f56a369 fix big-endian VP8LWriteBits
-54169d6 Merge "cwebp: name InputFileFormat members consistently"
-e2feefa Makefile.vc: split mux into separate lib
-27caa5a Merge "cwebp: add basic TIFF support"
-d8921dd cwebp: name InputFileFormat members consistently
-6f76d24 cwebp: add basic TIFF support
-4691407 Merge changes If39ab7f5,I3658b5ae
-cca7c7b Fixed nit: 10 -> 10.f
-5d09a24 WebPMuxCreate() error handling:
-777341c Fix a memleak in WebPMuxCreate()
-61c9d16 doc: correct lossless prefix coding table & code
-4c39757 Merge "mark VP8{,L}{GetInfo,CheckSignature} as WEBP_EXTERN"
-e4e36cc Merge "Mux: Allow only some frames/tiles to have alpha."
-ad2aad3 Merge "WebP Decoding error handling:"
-97649c8 Mux: Allow only some frames/tiles to have alpha.
-f864be3 Lower the quality settings for Alpha encoding.
-3ba81bb WebP Decoding error handling:
-fcc6992 add automatic YUVA/ARGB conversion during WebPEncode()
-802e012 fix compilation in non-FANCY_UPSAMPLING mode
-e012dfd make width/height coding match the spec
-228d96a mark VP8{,L}{GetInfo,CheckSignature} as WEBP_EXTERN
-637a314 remove the now unused *KeepA variants
-d11f6fc webpmux returns error strings rather than numbers
-fcec059 makefile.unix: cwebp: fix OSX link
-6b811f1 Merge "doc: remove lossless pdf"
-c963482 doc: remove lossless pdf
-b9ae4f0 cosmetics after mux changes b74ed6e, b494ad5
-b494ad5 Mux: only allow adding frame/tiles at the end.
-2c341b0 Merge "Added image characteristic hint for the codec."
-d373076 Added image characteristic hint for the codec.
-2ed2adb Merge "msvc: add intrinsic based BitsLog2Floor"
-e595e7c Merge "add demux.c to the makefiles"
-da47b5b Merge "demux: add {Next,Prev}Chunk"
-e5f4674 add demux.c to the makefiles
-4708393 demux: add {Next,Prev}Chunk
-e8a0a82 demux: quiet msvc warnings
-7f8472a Update the WebP Container Spec.
-31b68fe cleanup WebPPicture struct and API
-9144a18 add overflow check before calling malloc()
-81720c9 consistency cosmetics
-2ebe839 Merge "Add kramdown version information to README"
-7144308 enc/vp8l.c: fix build
-b7ac19f Add kramdown version information to README
-efdcb66 Merge "Edit for consistency, usage and grammar."
-0822010 Enable alpha in vvwebp
-8de9a08 Merge "Mux API change:"
-b74ed6e Mux API change:
-233a589 take picture->argb_stride into account for lossless coding
-04e33f1 Edit for consistency, usage and grammar.
-a575b4b Merge "cosmetics: add missing const"
-8d99b0f Merge "cosmetics: remove unimplemented function proto"
-69d0221 cosmetics: add missing const
-5b08318 cosmetics: remove unimplemented function proto
-b7fb0ed Log warning for unsupported options for lossless.
-e1f769f msvc: add intrinsic based BitsLog2Floor
-8a69c7d Bug-fix: Clamp backward dist to 1.
-b5b6ac9 Merge "Bring the special writer 'WebPMemoryWriter' to public API"
-a6a1909 Merge "Fix floating point exception with cwebp -progress"
-f2cee06 Fix floating point exception with cwebp -progress
-91b7a8c Bring the special writer 'WebPMemoryWriter' to public API
-310e297 support resize and crop for RGBA input
-a89835d Merge changes Ice662960,Ie8d7aa90,I2d996d5e,I01c04772
-ce614c0 Merge "dec/vp8: avoid setting decoder status twice"
-900285d dec/vp8: avoid setting decoder status twice
-8227adc Merge changes I6f02b0d0,I5cbc9c0a,I9dd9d4ed,Id684d2a1
-dcda59c Merge "demux: rename SetTile to SelectTile"
-622ef12 demux: rename SetTile to SelectTile
-81ebd37 Merge "demux: add {Next,Prev}Frame"
-02dd37a demux: add {Next,Prev}Frame
-4b79fa5 Merge "Limit the maximum size of huffman Image to 16MB."
-9aa34b3 Manually number "chapters," as chapter numbers are used in the narrative.
-2a4c6c2 Re-wrap at <= 72 columns
-a45adc1 Apply inline emphasis and monospacing, per gdoc / PDF
-9101120 Incorporate gdoc changes through 2012-06-08
-7a18248 Removed CodeRay syntax declarations ...
-b3ec18c Provide for code-block syntax highlighting.
-709d770 Replace high ASCII artifacts (curly quotes, etc.).
-930e8ab Lossless WebP doc largely ported to markdown text.
-18cae37 msvc: silence some build warnings
-b392308 Limit the maximum size of huffman Image to 16MB.
-f180df2 Merge "libwebp/demux: add Frame/Chunk iteration"
-2bbe1c9 Merge "Enable lossless encoder code"
-d0601b0 Merge changes I1d97a633,I81c59093
-78f3e34 Enable lossless encoder code
-d974a9c Merge "libwebp/demux: add simple format parsing"
-26bf223 Merge "libwebp: add WebPDemux stub functions"
-2f66668 Merge "modify WebPParseHeaders to allow reuse by GetFeatures"
-b402b1f libwebp/demux: add Frame/Chunk iteration
-ad9ada3 libwebp/demux: add WebPDemuxGetI
-2f2d4d5 libwebp/demux: add extended format parsing
-962dcef libwebp/demux: add simple format parsing
-f8f9408 libwebp: add WebPDemux stub functions
-fb47bb5 Merge "NumNamedElements() should take an enum param."
-7c68980 Fix asserts in Palette and BackwardReference code.
-fbdcb7e NumNamedElements() should take an enum param.
-fb4943b modify WebPParseHeaders to allow reuse by GetFeatures
-3697b5c write an ad-hoc EncodeImageInternal variant
-eaee9e7 Bug-Fix: Decode small (less than 32 bytes) images.
-0bceae4 Merge "cwebp: fix alpha reporting in stats output"
-0424b1e Rebase default encoding settings.
-c71ff9e cwebp: fix alpha reporting in stats output
-e2ffe44 Merge "Stop indefinite recursion for Huffman Image."
-70eb2bd Stop indefinite recursion for Huffman Image.
-f3bab8e Update vwebp
-6d5c797 Remove support for partial files in Mux.
-f1df558 WebPMuxAssemble() returns WebPData*.
-814a063 Rename 'Add' APIs to 'Set'.
-bbb0218 Update Mux psuedo-code examples.
-4fc4a47 Use WebPData in MUX set APIs
-c67bc97 Merge "add WebPPictureImportRGBX() and WebPPictureImportBGRX()"
-27519bc add WebPPictureImportRGBX() and WebPPictureImportBGRX()
-f80cd27 factorize code in Import()
-9b71502 histogram: add log2 wrapper
-8c34378 Merge "fix some implicit type conversion warnings"
-42f6df9 fix some implicit type conversion warnings
-250c16e Merge "doc: update lossless pdf"
-9d9daba Merge "add a PDF of the lossless spec"
-8fbb918 prefer webp/types.h over stdint.h
-0ca170c doc: update lossless pdf
-0862ac6 add a PDF of the lossless spec
-437999f introduce a generic WebPPictureHasTransparency() function
-d2b6c6c cosmetic fixes after Idaba281a
-b4e6645 Merge "add colorspace for premultiplied alpha"
-48f8275 add colorspace for premultiplied alpha
-069f903 Change in lossless bit-stream.
-5f7bb3f Merge "WebPReportProgress: use non-encoder specific params"
-f18281f WebPReportProgress: use non-encoder specific params
-9ef3228 Add support for raw lossless bitstream in decoder.
-7cbee29 Fix bug: InitIo reseting fancy_upsampling flag.
-880fd98 vwebp: fix exit w/freeglut
-1875d92 trap two unchecked error conditions
-87b4a90 no need to have mux.h as noinst clause in enc/
-88f41ec doc: fix bit alignment in VP8X chunk
-52f5a4e Merge "fix bug with lossy-alpha output stride"
-3bde22d fix bug with lossy-alpha output stride
-42d61b6 update the spec for the lossy-alpha compression methods.
-e75dc80 Move some more defines to format_constants.h
-c13f663 Move consts to internal header format_constants.h
-7f2dfc9 use a bit-set transforms_seen_ instead of looping
-18da1f5 modulate alpha-compression effort according to config.method
-f5f2fff Merge "Alpha flag fix for lossless."
-c975c44 Alpha flag fix for lossless.
-4f067fb Merge "Android: only build dec_neon with NEON support"
-255c66b Android: only build dec_neon with NEON support
-8f9117a cosmetics: signature fixes
-39bf5d6 use header-less lossless bitstream for alpha channel
-75d7f3b Merge "make input data be 'const' for VP8LInverseTransform()"
-9a721c6 make input data be 'const' for VP8LInverseTransform()
-9fc64ed Disallow re-use of same transformation.
-98ec717  use a function pointer for ProcessRows()
-f7ae5e3 cosmetics: join line
-140b89a factor out buffer alloc in AllocateARGBBuffers()
-a107dfa Rectify WebPParseOptionalChunks().
-237eab6 Add two more color-spaces for lossless decoding.
-27f417a fix orthographic typo
-489ec33 add VP8LEncodeStream() to compress lossless image stream
-fa8bc3d make WebPEncodingSetError() take a const picture
-638528c bitstream update for lossy alpha compression
-d73e63a add DequantizeLevels() placeholder
-ec122e0 remove arch-dependent rand()
-d40e765 fix alignment
-1dd6a8b Merge "remove tcoder, switch alpha-plane compression to lossless"
-3e863dd remove tcoder, switch alpha-plane compression to lossless
-8d77dc2 Add support for lossless in mux:
-831bd13 Make tile size a function of encoding method.
-778c522 Merge "remove some variable shadowing"
-817c9dc Few more HuffmanTreeToken conversions.
-37a77a6 remove some variable shadowing
-89c07c9 Merge "normalize example header includes"
-4aff411 Merge "add example_util.[hc]"
-00b29e2 normalize example header includes
-061263a add example_util.[hc]
-c6882c4 merge all tree processing into a single VP8LProcessTree()
-9c7a3cf fix VP8LHistogramNumCodes to handle the case palette_code_bits == 0
-b5551d2 Merge "Added HuffmanTreeCode Struct for tree codes."
-8b85d01 Added HuffmanTreeCode Struct for tree codes.
-093f76d Merge "Allocate single memory in GetHuffBitLengthsAndCodes."
-41d8049 Allocate single memory in GetHuffBitLengthsAndCodes.
-1b04f6d Correct size in VP8L header.
-2924a5a Makefile.vc: split object lists based on directory
-c8f2416 Merge "add assert(tokens)"
-4323994 add assert(tokens)
-9f54745 Catch an error in DecodeImageData().
-ac8e5e4 minor typo and style fix
-9f566d1 clean-up around Huffman-encode
-c579a71 Introduce CHUNK_SIZE_BYTES in muxi.h.
-14757f8 Make sure huffman trees always have valid symbols
-4105061 makefile.unix: add support for building vwebp
-48b3772 Merge "fixed signed/unsigned comparison warning"
-57f696d Merge "EncodeImageInternal: fix potential leak"
-d972cdf EncodeImageInternal: fix potential leak
-5cd12c3 fixed signed/unsigned comparison warning
-cdca30d Merge "cosmetics: shorten long line"
-e025fb5 cosmetics: shorten long line
-22671ed Merge "enc/vp8l: fix double free on error"
-e1b9b05 Merge "cosmetics: VP8LCreateHuffmanTree: fix indent"
-a8e725f enc/vp8l: fix double free on error
-27541fb cosmetics: VP8LCreateHuffmanTree: fix indent
-1d38b25 cwebp/windows: use MAKE_REFGUID where appropriate
-817ef6e Merge "cwebp: fix WIC/Microsoft SDK compatibility issue"
-902d3e3 cwebp: fix WIC/Microsoft SDK compatibility issue
-89d803c Merge "Fix a crash due to wrong pointer-integer arithmetic."
-cb1bd74 Merge "Fix a crash in lossless decoder."
-de2fe20 Merge "Some cleanup in VP8LCreateHuffmanTree() (and related functions CompareHuffmanTrees() and SetBitDepths()): - Move 'tree_size' initialization and malloc for 'tree + tree_pool'   outside the loop. - Some renames/tweaks for readability."
-ce69177 Fix a crash due to wrong pointer-integer arithmetic.
-e40a368 Fix a crash in lossless decoder.
-3927ff3 remove unneeded error condition for WebPMuxNumNamedElements()
-2c140e1 Some cleanup in VP8LCreateHuffmanTree() (and related functions CompareHuffmanTrees() and SetBitDepths()): - Move 'tree_size' initialization and malloc for 'tree + tree_pool'   outside the loop. - Some renames/tweaks for readability.
-861a5b7 add support for animation
-eb5c16c Merge "Set correct encode size in encoder's stats."
-4abe04a fix the return value and handle missing input file case.
-2fafb85 Set correct encode size in encoder's stats.
-e7167a2 Provide one entry point for backward references.
-c4ccab6 Print relevant lossless encoding stats in cwebp.
-e3302cf GetHuffBitLengthsAndCodes: reduce level of indirection
-b5f2a9e enc/vp8l: fix uninitialized variable warning
-7885f8b makefile.unix: add lossless encoder files
-1261a4c Merge "cosmetics"
-3926b5b Merge "dsp/cpu.c: Android: fix crash on non-neon arm builds"
-834f937 dsp/cpu.c: Android: fix crash on non-neon arm builds
-126e160 cosmetics
-e38602d Merge branch 'lossless_encoder'
-e8d3d6a split StoreHuffmanCode() into smaller functions
-d0d8899 more consolidation: introduce VP8LHistogramSet
-1a210ef big code clean-up and refactoring and optimization
-41b5c8f Some cosmetics in histogram.c
-ada6ff7 Approximate FastLog between value range [256, 8192]
-ec123ca Forgot to update out_bit_costs to symbol_bit_costs at one instance.
-cf33ccd Evaluate output cluster's bit_costs once in HistogramRefine.
-781c01f Simple Huffman code changes.
-a2849bc Lossless decoder: remove an unneeded param in ReadHuffmanCodeLengths().
-b39e748 Reducing emerging palette size from 11 to 9 bits.
-bfc73db Move GetHistImageSymbols to histogram.c
-889a578 Improve predict vs no-predict heuristic.
-01f5066 code-moving and clean-up
-31035f3 reduce memory usage by allocating only one histo
-fbb501b Restrict histo_bits to ensure histo_image size is under 32MB
-8415ddf further simplification for the meta-Huffman coding
-e491729 A quick pass of cleanup in backward reference code
-83332b3 Make transform bits a function of encode method (-m).
-72920ca introduce -lossless option, protected by USE_LOSSLESS_ENCODER
-c6ac4df Run TraceBackwards for higher qualities.
-412222c Make histo_bits and transform_bits function of quality.
-149b509 Update lossless encoder strategy:
-0e6fa06 cache_bits passed to EncodeImageInternal()
-e38b40a Factorize code for clearing HtreeGroup.
-6f4a16e Removing the indirection of meta-huffman tables.
-3d33ecd Some renaming/comments related to palette in lossless encoder.
-4d02d58 Lossless encoder: correction in Palette storage
-4a63623 fix a memleak in EncodeImageInternal()
-0993a61 Full and final fix for prediction transform
-afd2102 Fix cross-color transform in lossless encoder
-b96d874 Need to write a '0' bit at the end of transforms.
-54dad7e Color cache size should be counted as 0 when cache bits = 0
-4f0c5ca Fix prediction transform in lossless encoder.
-36dabda Fix memory leak in method EncodeImageInternal for histogram_image.
-352a4f4 Get rid of PackLiteralBitLengths()
-d673b6b Change the predictor function to pass left pixel
-b2f9946 Fix CopyTileWithPrediction()
-84547f5 Add EncodeImageInternal() method.
-6b38378 Guard the lossless encoder (in flux) under a flag
-09f7532 Fix few nits (const qualifiers)
-648be39 Added implementation for various lossless functions
-32714ce Add VP8L prefix to backward ref & histogram methods.
-fcba7be Fixed header file tag (WEBP_UTILS_HUFFMAN_ENCODE_H_)
-bc70374 Add backward_ref, histogram & huffman encode modules from lossless.
-fdccaad Fixing nits
-227110c libwebp interface changes for lossless encoding.
-50679ac minor style fixes
-b38dfcc remove unneeded reference to NUM_LITERAL_CODES
-8979675 harmonize header description
-c04eb7b tcoder.c: define NOT_HAVE_LOG2 for MSVC builds
-9a214fa Merge "VP8[L]GetInfo: check input pointers"
-5c5be8b VP8[L]GetInfo: check input pointers
-0c188fe Merge changes I431acdfe,I713659b7
-b3515c6 mux: drop 'chunk' from ChunkInfo member names
-aea7923 muxi.h: remove some unused defines
-0142249 update NEWS file for next release
-29e3f7e Merge "dec: remove deprecated WebPINew()"
-4718e44 Merge "muxedit: a few more size_t changes"
-82654f9 Merge "muxedit: remove a few redundant NULL checks"
-02f27fb dec: remove deprecated WebPINew()
-ccddb3f muxedit: remove a few redundant NULL checks
-a6cdf71 muxedit: a few more size_t changes
-a384689 Merge "mux: remove unused LIST_ID"
-11ae46a alpha.c: quiet some size_t -> int conversion warnings
-dee4669 mux: remove unused LIST_ID
-03f1f49 mux: add version checked entry points
-6a0abda Merge "doc: tile/alpha corrections"
-c8139fb Merge "few cosmetics"
-6833873 Merge "lossless: remove some size_t -> int conversions"
-5249e94 doc: tile/alpha corrections
-d96e722 huffman: quiet int64 -> int conversion warning
-532020f lossless: remove some size_t -> int conversions
-23be6ed few cosmetics
-1349eda Merge "configure: AC_ARG_* use AS_HELP_STRING"
-bfbcc60 configure: AC_ARG_* use AS_HELP_STRING
-1427ca8 Merge "Makefile.am: header file maintenance"
-087332e Merge "remove unused parameter 'round' from CalcProba()"
-9630e16 remove unused parameter 'round' from CalcProba()
-92092ea Merge "bit_reader.h: correct include"
-a87fc3f Merge "mux: ensure # images = # tiles"
-53af99b Merge "mux: use size_t consistently"
-39a57da Makefile.am: header file maintenance
-1bd0bd0 bit_reader.h: correct include
-326a3c6 mux: ensure # images = # tiles
-95667b8 mux: use size_t consistently
-231ec1f Removing the indirection of meta-huffman tables.
-15ebcba check return pointer from MuxImageGetListFromId
-b0d6c4a Merge "configure: remove test for zlib.h"
-8cccac5 Merge "dsp/lossless: silence some build warnings"
-b08819a dsp/lossless: silence some build warnings
-7ae2252 Android.mk: SSE2 & NEON updates
-0a49e3f Merge "makefile.unix add missing header files"
-2e75a9a Merge "decode.h: use size_t consistently"
-fa13035 configure: remove test for zlib.h
-d3adc81 makefile.unix add missing header files
-262fe01 Merge "makefile.unix & Android.mk: cosmetics"
-4cce137 Merge "enc_sse2 add missing stdlib.h include"
-80256b8 enc_sse2 add missing stdlib.h include
-9b3d1f3 decode.h: use size_t consistently
-64083d3 Merge "Makefile.am: cosmetics"
-dceb8b4 Merge changes If1331d3c,I86fe3847
-0e33d7b Merge "webp/decode.h: fix prototypes"
-fac0f12 rename BitReader to VP8LBitReader
-fbd82b5 types.h: centralize use of stddef.h
-2154835 Makefile.am: cosmetics
-1c92bd3 vp8io: use size_t for buffer size
-90ead71 fix some more uint32_t -> size_t typing
-cbe705c webp/decode.h: fix prototypes
-3f8ec1c makefile.unix & Android.mk: cosmetics
-217ec7f Remove tabs in configure.ac
-b3d35fc Merge "Android.mk & Makefile.vc: add new files"
-0df04b9 Android.mk & Makefile.vc: add new files
-e4f20c5 Merge "automake: replace 'silent-rules' w/AM_SILENT_RULES"
-8d254a0 cosmetics
-6860c2e fix some uint32_t -> size_t typing
-4af1858 Fix a crash due to max symbol in a tree >= alphabet size
-6f01b83 split the VP8 and VP8L decoding properly
-f2623db enable lossless decoder
-b96efd7 add dec/vp8i.h changes from experimental
-19f6398 add dec/vp8l{i.h,.c} from experimental
-c4ae53c add utils/bit_reader.[hc] changes from experimental
-514d008 add dsp/lossless.[hc] from experimental
-9c67291 add utils/huffman.[hc] from experimental
-337914a add utils/color_cache.[hc] from experimental
-b3bf8fe the read-overflow code-path wasn't reporting as an error
-1db888b take colorspace into account when cropping
-61c2d51 move the rescaling code into its own file and make enc/ and dec/ use it.
-efc2016 Make rescaler methods generic
-3eacee8 Move rescaler methods out of io.c.
-a69b893 automake: replace 'silent-rules' w/AM_SILENT_RULES
-6f7bf64 issue 111: fix little-endian problem in bit-reader
-ed278e2 Removed unnecessary lookup
-cd8c3ba fix some warnings: down-cast and possibly-uninitialized variable
-0a7102b ~1% improvement of alpha compression
-3bc1b14 Merge "Reformat container doc"
-dc17abd mux: cosmetics
-cb5810d Merge "WebPMuxGetImage: allow image param to be NULL"
-506a4af mux: cosmetics
-135e8b1 WebPMuxGetImage: allow image param to be NULL
-de556b6 Merge "README.mux: reword some descriptions"
-0ee2aeb Makefile.vc: use batch mode rules
-d9acddc msvc: move {i,p}db creation to object directory
-237c9aa Merge "expose WebPFree function for DLL builds"
-b3e4054 silence msvc debug build warning
-45feb55 expose WebPFree function for DLL builds
-11316d8 README.mux: reword some descriptions
-4be52f4 factorize WebPMuxValidate
-14f6b9f mux: light cleanup
-5e96a5d add more param checks to WebPPictureDistortion()
-8abaf82 Merge "silence some type size related warnings"
-1601a39 silence some type size related warnings
-f3abe52 Merge "idec: simplify buffer size calculation"
-a9c5cd4 idec: simplify buffer size calculation
-7b06bd7 Merge "configure/automake: add silent-rules option"
-e9a7d14 Reformat container doc
-d4e5c7f configure/automake: add silent-rules option
-5081db7 configure/automake: no -version-info for convenience libs
-85b6ff6 Merge "idec: fix WebPIUpdate failure"
-7bb6a9c idec: fix internal state corruption
-89cd1bb idec: fix WebPIUpdate failure
-01b6380 4-5% faster decoding, optimized byte loads in arithmetic decoder.
-631117e Merge "cosmetics & warnings"
-a0b2736 cosmetics & warnings
-f73947f use 32bit for storing dequant coeffs, instead of 16b.
-b960030 Merge "store prediction mode array as uint8_t[16], not int[16]."
-7b67881 store prediction mode array as uint8_t[16], not int[16].
-cab8d4d Merge "NEON TransformOne"
-ba503fd NEON TransformOne
-9f740e3 Merge "gcc warning fix: remove the 'const' qualifier."
-f76d358 gcc warning fix: remove the 'const' qualifier.
-e78478d Merge "webpmux: make more use of WebPData"
-f85bba3 Merge "manpages: add BUGS section"
-48a43bb Merge "makefile.unix: variable cosmetics"
-c274dc9 makefile.unix: variable cosmetics
-1f7b859 re-organize the error-handling in the main loop a bit
-1336fa7 Only recompute level_cost_[] when needed
-771ee44 manpages: add BUGS section
-0f7820e webpmux: make more use of WebPData
-974aaff examples: logging updates
-6c14aad Merge "better token buffer code"
-f405425 better token buffer code
-18d959f Merge "mux: add WebPData type"
-eec4b87 mux: add WebPData type
-0de3096 use 16bit counters for recording proba counts
-7f23678 fix for LevelCost + little speed-up
-7107d54 further speed-up/cleanup of RecordCoeffs() and GetResidualCost()
-fd22104 Introduce Token buffer (unused for now)
-5fa148f Merge "speed-up GetResidualCost()"
-28a9d9b speed-up GetResidualCost()
-11e7dad Merge "misc cosmetics"
-378086b misc cosmetics
-d61479f add -print_psnr and -print_ssim options to cwebp.
-2e3e8b2 add a WebPCleanupTransparentArea() method
-552c121 Merge "mux: plug some memory leaks on error"
-a2a81f7 Merge "fix Mach-O shared library build"
-b3482c4 Merge "fix gcc-4.0 apple 32-bit build"
-e4e3ec1 fix gcc-4.0 apple 32-bit build
-b0d2fec mux: plug some memory leaks on error
-f0d2c7a pass of cosmetics
-b309a6f fix Mach-O shared library build
-241ddd3 doc: delete mux container pdf
-8b1ba27 doc: update VP8 decode guide link
-7e4371c WebPMuxCreate: fix unchecked malloc
-eb42558 Merge "have makefile.unix clean up src/webp/*~ too"
-a85c363 Merge "correct EncodeAlpha documentation"
-a33842f Merge "Update webp container spec with alpha filter options."
-8d6490d Incremental support for some of the mux APIs.
-b8375ab have makefile.unix clean up src/webp/*~ too
-b5855fc correct EncodeAlpha documentation
-dba37fe Update webp container spec with alpha filter options.
-2e74ec8 fix compile under MINGW
-716d1d7 fix suboptimal MAX_LEN cut-off limit
-57cab7b Harmonize the alpha-filter predictions at boundary
-3a98953 Merge "Fix bug for Alpha in RGBA_4444 color-mode."
-8ca2076 Introduce a 'fast' alpha mode
-221a06b Fix bug for Alpha in RGBA_4444 color-mode.
-ad1e163 cosmetics: normalize copyright headers
-c77424d cosmetics: light include cleanup
-9d0e17c fix msvc build breakage after 252028a
-7c4c177 Some readability fixes for mux library
-d8a47e6 Merge "Add predictive filtering option for Alpha."
-252028a Add predictive filtering option for Alpha.
-9b69be1 Merge "Simplify mux library code"
-a056170 Simplify mux library code
-992187a improve log2 test
-e852f83 update Android.mk file list
-a90cb2b reduce number of copies and mallocs in alpha plane enc/dec
-b1662b0 fix some more type conversion warnings w/MSVC
-223d8c6 fix some uint64_t -> int conversion warnings with MSC
-c1a0437 Merge "simplify checks for enabling SSE2 code"
-f06817a simplify checks for enabling SSE2 code
-948d4fe silence a msvc build warning
-9117954 vwebp: msvc build tweaks
-7937b40 simple WebP viewer, based on OpenGL
-6aac1df add a bunch of missing 'extern "C"'
-421eb99 Merge "Remove assigned-but-not-used variable "br""
-91e27f4 better fitting names for upsampling functions
-a5d7ed5 Remove assigned-but-not-used variable "br"
-f62d2c9 remove unused 'has_alpha' from VP8GetInfo() signature
-08e8658 trap alpha-decoding error
-b361eca add cut-off to arith coder probability update.
-8666a93 Some bug-fixes for images with alpha.
-273a12a fix off-by-1 diff in case cropping and simple filtering
-2f741d1 webpmux: ReadImage: fix ptr free in error case
-721f3f4 fix alpha decode
-60942c8 fix the has_alpha_ order
-30971c9 Implement progress report (and user abort)
-eda520a cosmetics after 9523f2a
-38bd5bb Merge "Better alpha support in webpmux binary"
-ccbaebf Merge "Updated the includes to relative paths."
-d71fbdc fix small typo in error message array
-cdf97aa Better alpha support in webpmux binary
-885f25b Updated the includes to relative paths.
-a0ec9aa Update WebP encoder (cwebp) to support Alpha.
-667b769 Fixed the include for types.h within mux.h
-9523f2a Add Alpha Encode support from WebPEncode.
-16612dd Merge "Add Alpha Decode support from WebPDecode."
-d117a94 Add Alpha Decode support from WebPDecode.
-6722873 cosmetics after e1947a9
-e1947a9 Add Alpha encode/decode code.
-afc4c5d simplify code by introducing a CopyPlane() helper func
-113b312 Merge "MUX API Updates"
-c398f59 MUX API Updates
-5acf04e remove orphan source file
-059f03e Merge "dec: validate colorspace before using as array index"
-70a0398 Merge "factorize some code"
-9b243b3 factorize some code
-372e2b4 Correct a bug in ReadPNG() with GRAY_ALPHA images
-469d6eb Merge "Makefile.am: remove redundant noinst_HEADERS"
-9fe3372 dec: validate colorspace before using as array index
-8962030 remove orphan source file
-ced3e3f Makefile.am: remove redundant noinst_HEADERS
-964387e use WEBP_INLINE for inline function declarations
-90880a1 Merge "manpages: break long lines"
-b591089 Merge "manpages: minor formatting updates"
-4c451e4 Merge "Rectify the Chunk parsing logic."
-04e84cf examples: slight cleanup
-099717c manpages: break long lines
-1daf39b manpages: minor formatting updates
-abd030b fix missing "(void)" in function signature
-f6a7d75 remove useless test
-f07b213 Rectify the Chunk parsing logic.
-b8634f7 webpmux: fix lib link order
-42c2e68 Fix missing coma (on uncompiled code)
-d8329d4 Android.mk: add missing source files
-13a54df Merge "More aggressive copy-edit; add TODO; validate HTML5"
-868b96a More aggressive copy-edit; add TODO; validate HTML5
-767afea configure: check for a symbol contained in libpng
-408b891 Merge "Linewrap at 72 cols. Casual copy-edit."
-3ae318c Merge "Restore (most) emphasis; add emphasis to normative RFC 2119 terms (MUST, etc.)"
-918eb2d Merge "Basic container doc source clean-up; fix lists and pseudocode blocks."
-03bec9e Linewrap at 72 cols. Casual copy-edit.
-2678d81 Restore (most) emphasis; add emphasis to normative RFC 2119 terms (MUST, etc.)
-428674d Basic container doc source clean-up; fix lists and pseudocode blocks.
-6a77d92 Merge "Makefile.vc: cosmetics"
-28c38e8 Merge "Makefile.vc: condense directory creation rules"
-55be2cf Initial import of container spec document, from pdftotext transform.
-a82a788 Makefile.vc: cosmetics
-c8f41ce Makefile.vc: condense directory creation rules
-2b877cd Some fixes to Makefile.vc to support the src\mux directory.
-3eb969b Merge "Add Makefile.vc for Mux library & binary."
-e78e971 Add Makefile.vc for Mux library & binary.
-6aedde5 Add manual for WebPMux tool.
-8a360d0 Merge "Added WebPMux Binary."
-a4f32ca Added WebPMux Binary.
-f3bf4c7 Added Mux Container Spec & README for MUX-API.
-9f761cf Changed function signature for WebPMuxCreate
-5f31b5e Merge "Add Mux library for manipulating WebP container."
-2315785 Add Mux library for manipulating WebP container.
-7e198ab update ChangeLog (tag: v0.1.3)
-dfc9c1e Harmonize the dates
-28ad70c Fix PNG decoding bug
-846e93c Update AUTHORS & add .mailmap
-563e52d cosmetics after '76036f5 Refactor decoder library'
-76036f5 Refactor decoder library
-377ef43 configure.ac: update AC_INIT params
-7a8d876 use a user-visible MACRO for max width/height.
-d4e9f55 NEON decode support in WebP
-0ee683b update libtool version-info
-fdbe02c windows: match _cond_destroy logic w/return variable name
-206b686 README: correct advanced decode api pseudo-code
-6a32a0f make VP8BitReader a typedef, for better re-use
-b112e83 create a libwebputils under src/utils
-ee697d9 harmonize the include guards and #endif comments
-a1ec07a Fixing compiler error in non x86 arch.
-dcfa509 Fixed recursive inclusion of bit_writer.h and vp8enci.h.
-e06ac08 create a separate libwebpdsp under src/dsp
-ebeb412 use unsigned int for bitfields
-341cc56 make kNewRange a static array
-227a91e README: minor wording update
-05bd8e6 add man pages to dist
-812dfa1 bump up versions in preparations for 0.1.3
-a5b78c8 wrap alpha-related options under WEBP_EXPERIMENTAL_FEATURES flag
-34dc790 regen ChangeLog for 0.1.3-rc2
-7c43663 Silence some (more) Visual Studio warnings.
-60306e8 add top-level gitattributes
-2aa6b80 Slience some Visual Studio warnings.
-4cbbb29 Merge "bump up version for next freeze"
-a329167 bump up version for next freeze
-c7e86ab cosmetics: fix comment line lengths
-c9e037a makefile.unix: add simple dist target
-87d58ce makefile.unix: rule maintenance
-d477de7 mend
-fac15ec Update NEWS & README for next release V0.1.3
-6215595 Merge "add a -partition_limit option to limit the number of bits used by intra4x4"
-3814b76 Merge "reorganize chunk-parsing code"
-900286e add a -partition_limit option to limit the number of bits used by intra4x4
-cd12b4b add the missing cost for I4/I16 mode selection
-dfcc213 reorganize chunk-parsing code
-3cf2030 initialize pointers to function within VP8DspInit()
-d21b479 Merge "windows: add decode threading support"
-473ae95 fix hang on thread creation failure
-fccca42 windows: add decode threading support
-a31f843 Use the exact PNG_INCLUDES/PNG_LIBS when testing for -lpng
-ad9b45f Merge "Makefile.vc: rule maintenance"
-565a2ca Makefile.vc: rule maintenance
-2d0da68 makefile.unix: disable Wvla by default
-fc7815d multi-thread decoding: ~25-30% faster
-acd8ba4 io->teardown() was not always called upon error
-c85527b Merge "Makefile.vc: add DLL configs"
-e1e9be3 cosmetics: spelling/grammar in README and lib headers
-b4d0ef8 Makefile.vc: add DLL configs
-998754a remove unused nb_i4_ and nb_i16_ fields.
-9f01ce3 rename WebPDecBuffer::memory -> private_memory
-fb5d659 fix an overflow bug in LUT calculation
-d646d5c swig: add WebPDecodeARGB
-78aeed4 add missing WebPDecodeARGBInto() and switch ARGB4444 to RGBA4444 as was intended
-cd7c529 explicitly mark library functions as extern
-19db59f add support for RGB565, ARGB4444 and ARGB colorspace (decoder)
-c915fb2 encoder speed-up: hardcode special level values
-c558bda Rename and improve the API to retrieve decoded area
-bf599d7 Merge "makefile.unix: disable -Wvla by default"
-c9ea03d SSE2 version of strong filtering
-993af3e makefile.unix: disable -Wvla by default
-3827e1b Merge "examples: (windows/WIC) add alpha support"
-e291fae SSE2 functions for the fancy upsampler.
-a06bbe2 add WebPISetIOHooks() to set some custom hooks on the incremental decoder object.
-7643a6f Merge "makefile.unix: use uname to detect OSX environment"
-5142a0b export alpha channel (if present) when dumping to PGM format
-14d5731 makefile.unix: use uname to detect OSX environment
-0805706 examples: quiet warnings
-3cfe088 examples: (windows/WIC) add alpha support
-13ed94b add compile warning for variable-length-array
-5a18eb1 Merge "add Advanced Decoding Interface"
-5c4f27f add missing \n
-f4c4e41 80 cols fix
-d260310 add Advanced Decoding Interface
-bd2f65f sse2 version of the complex filter
-96ed9ce perform two idct transforms at a time when possible
-01af7b6 use aligned stored
-0e1d1fd Merge "Makefile.vc: add experimental target"
-2a1292a Makefile.vc: add experimental target
-23bf351 Enable decode SSE2 for Visual Studio
-131a4b7 dec/dsp_sse2: fix visual studio compile
-00d9d68 swig: file reorganization
-7fc7e0d Merge "swig/java: basic encode support"
-3be57b1 fix MSVC compile for WEBP_EXPERIMENTAL_FEATURES
-40a7e34 dec/dsp: disable sse2 for Visual Studio builds
-e4d540c add SSE2 code for transform
-54f2170 swig/java: basic encode support
-c5d4584 call function pointers instead of C-version
-ea43f04 Merge "configure: mingw32 targets: test for WIC support"
-a11009d SSE2 version of simple in-loop filtering
-42548da shave one unneeded filter-cache line
-31f9dc6 configure: mingw32 targets: test for WIC support
-1955969 Merge "split expression in two."
-415dbe4 split expression in two.
-e29072a configure: test for zlib only w/--enable-experimental
-b2b0090 Simplify Visual Studio ifdefs
-ca7a2fd Add error reporting from encoding failures.
-6c9405d Merge "Makefile.vc: require CFG with clean target"
-0424ecd Makefile.vc: require CFG with clean target
-003417c Enable SSE2 for Visual Studio builds
-af10db4 little speed up for VP8BitUpdate()
-e71418f more MSVC files to ignore
-46d9036 cosmetics
-edf59ab typo fix
-72229f5 Add support for x64 and SSE2 builds under Windows.
-92e5c6e VP8GetInfo() + WebPResetDecParams()
-416b7a6 raise the fixed-point precision for the rescaler
-aa87e4e fix alignment
-eb66670 disable WEBP_EXPERIMENTAL_FEATURES
-c5ae7f6 typo fix: USE_ => WEBP_
-d041efa swig: add libwebp.jar/libwebp_java_wrap.c
-f6fb387 add swig interface
-e927390 align buffer for double too
-842c009 fix -strong option
-d0a7038 Merge "cosmetics"
-fc0a02e fix the dichotomy loop
-38369c0 cosmetics
-8dfc4c6 factorize and unify GetAlpha() between the C and SSE2 version
-6d0e66c prepare experimentation with yuv444 / 422
-79cc49f add a --enable-experimental option to './configure'
-d757523 sse2 version of CollectHistogram()
-c1c728d add an extra #ifdef WEBP_EXPERIMENTAL_FEATURES to avoid 'unused variable' warning
-60c61d2 always call VP*EncDeleteAlpha() unconditionnally, for simplicity
-0f8c638 simply don't call WriteExtensions() if WEBP_EXPERIMENTAL_FEATURES is not defined
-47c661d rename swap -> swap_rb
-10d55bb move chunk[] declaration out of the for() loop
-517cec2 fix indentation
-f7d9e26 fix merge problems
-8fd42b3 add a stride 'a_stride' for the alpha plane
-b8dcbf2 fix alpha-plane copy and crop methods
-cdef89d fix some 'unused variable' warning
-fb29c26 SSE2 version of the fwd transform and the squared sum metric
-2ab4b72 EXPERIMENTAL: add support for alpha channel
-cfbf88a add SSE2 functions. ~2x faster encoding on average.
-e7ff3f9 merge two ITransforms together when applicable and change the TTransform to return the sum directly.
-ca55413 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
-8aa50ef fix some 'man' typos
-d3f3bdd update ChangeLog (tag: v0.1.2)
-d7e9a69 update contributor list
-261abb8 add a 'superclean' section
-276ae82 Remove files not mean to be in git, and update .gitignore
-2486845 build: prepare libwebp.pc
-14ceb6e add "-version" description to man pages
-b247a3b Create the m4 directory, and also place .gitignore in there for libtool.
-cdd734c Resolve automake warnings
-c5fa726 build: add pkgconfig files
-b20aaca build: just use autoreconf, avoid calling tools manually
-4b0b0d6 cwebp: use modern functions
-efbc6c4 update Android.mk
-7777570 better version of ChangeLog
-fa70d2b update version number in the DOC
-f8db5d5 more C89-fixes
-0de013b fix typos
-650ffa3 add version getters for decoder and encoder
-be4867d doc for incremental decoding
-56732a1 add idec.obj in MSVC makefile
-208afb5 add c++ guards
-8bf76fe add incremental decoding
-1f28832 'inline' isn't defined in strict ansi c89
-8b77c63 move the quantization function to dsp.c
-b2c3575 add a 'last_y' field to WebPDecParams
-2654c3d correctly pass along the exact same status returned from ParsePartitions
-4704146 add missing precision in the man
-6d978a6 add error messages
-6463e6a add some install instructions, and fix intel-mac flags
-05fb7bf Merge ".gitignore: initial version"
-c33f019 .gitignore: initial version
-e532b9a Makefile: allow out of tree builds
-4c0da7a enable sparse dc/ac transforms
-07dbb8d clarify the return logic
-5c69e1b fix bigger-by-1 array
-7c5267e fix a (harmless) typo: non_zero_ -> non_zero_ac_
-bc75213 fix missing free()
-af3e2aa remove trailing spaces
-13e50da make the bitreader preload at least 8bits, instead of post-load them (this makes initialization easier and will be helpful for incremental decoding). Modify ParsePartitions() to accommodate for truncated input.
-f4888f7 emit 9 - nb_bits trailing zeros instead of 8
-3db6525 separate block-parsing into a visible VP8DecodeMB()
-a871de0 add missing extern "C"
-b3ce8c5 remove a gcc warning about type pun by using a proper union'd type
-e186371 update after addition of webpi.h
-3e856e2 Extract some useful functions around decoding buffer WebPDecParams.
-d5bc05a make the filtering process match libvpx and ffvp8
-dd60138 add man pages for cwebp(1) and dwebp(1)
-c4fa364 fix header
-5b70b37 * add an option to bypass_filtering in VP8Io.
-b97a400 simplify QuantizeBlock code a bit
-84b58eb add more checks around picture allocation
-b65a3e1     remove absolute_delta_ field and syntax code
-0744e84 Dont' open output file until we're sure the input file is valid
-d5bd54c fix typo and buggy line
-f7a9549 Add a simple top-level makefile.unix for quick & easy build.
-5f36b94 update the doc for the -f option
-f61d14a a WebP encoder converts PNG & JPEG to WebP
-81c9662 oops: forgotten call to Initialize() + move the error message to a more useful place
-87ffa00 typo: fix a missing 'R', was confusing.
-b04b857 * add decoding measurement using stopwatch.h (use -v option) * support PNG output through WIC on Win32
-746a482 * make (*put)() hook return a bool for abort request. * add an enum for VP8Status() to make things clearer
-73c973e * strengthen riff/chunk size checks * don't consider odd-sized chunks being an error
-1dc4611 add support for PNG output (default) regularize include guards
-860641d fix a typo: sizeof(kYModeProbaInter0) => sizeof(kUVModeProbaInter0)
-3254fc5 fix some petty constness fix the ./configure file too
-504d339 fix eof_ mis-initialization
-2bc0778 leftover Makefile.* from previous commit
-d2cf04e move Makefile.am one level below, to src/dec fix typos here and there dwebp is now an installed program
-ade92de typo: vp8.h -> decode_vp8.h
-d724124 forgot to declare types.h to be installed
-6421a7a move the decoder sourcetree to a sub-location src/dec to make room for future libs sources
-a9b3eab correct layout name is IMC4.
-2330522 handle corner case of zero-dimensions
-280c365 make VP8Init() handle short buffers (< 2 bytes) correctly
-b1c9e8b handle error cases more robustly
-0e94935 Merge "table-less version of clip_8b()"
-1e0a2d2 table-less version of clip_8b()
-e12109e dwebp: change -yuv option to -raw change the layout to IMC2
-d72180a speed-up fancy upscaler
-9145f3b reset eof_ at construction time
-a7ee055 simplify the logic of GetCoeffs()
-f67b593 lot of cosmetics
-ea27d7c fix endian problem on PowerPC
-beb0a1b fix signature of VP8StoreBlock
-b128c5e Merge "fancy chroma upscaling"
-6a37a2a fancy chroma upscaling
-ff565ed fix two numeric typos
-5a936a0 use uintptr_t for casting pointers to ints
-e14a030 for cross_compiling=yes to prevent executing any binary
-83b545e add vc9+ makefile
-296f691 fix output loop for small height
-cbfbb5c convert to plain-C
-f09f96e Fix declaration after statement warning
-5981ee5 Fix UV plane ac/dc quantizer transposition
-c8d15ef convert to ANSI-C
-c3f41cb Initial commit
+20a7fea0 extras/Makefile.am: fix libwebpextras.la reference
+415f3ffe update ChangeLog (tag: v0.6.0-rc3)
+3c6d1224 update NEWS
+ee4a4141 update AUTHORS
+32ed856f Fix "all|no frames are keyframes" settings.
+f4dc56fd disable GradientUnfilter_NEON
+0d8e0588 img2webp: treat -loop as a no-op w/single images
+b0450139 ReadImage(): restore size reporting
+0ad3b4ef update ChangeLog (tag: v0.6.0-rc2)
+6451709e img2webp,get_disto: fix image decode w/WIC builds
+92504d21 get_disto: make ReadPicture() return a bool
+c3e4b3a9 update NEWS
+3363eb6d man/img2webp.1: fix formatting warning
+4d1312f2 update NEWS
+36c42ea4 bump version to 0.6.0
+bb498a51 update AUTHORS
+84cef16f Makefile.vc: fix CFG=debug-dynamic build
+919f9e2f Merge "add .rc files for windows dll versioning"
+f1ae8af4 Merge ".gitignore: add img2webp"
+4689ce16 cwebp: add a -sharp_yuv option for 'sharp' RGB->YUV conversion
+79bf46f1 rename the pretentious SmartYUV into SharpYUV
+eb1dc89a silently expose use_delta_palette in the WebPConfig API
+c85b0dde .gitignore: add img2webp
+43d3f01a add .rc files for windows dll versioning
+668e1dd4 src/{dec,enc,utils}: give filenames a unique suffix
+0e6b7f33 Merge "iosbuild.sh: only add required headers to framework"
+29ed6f9a iosbuild.sh: only add required headers to framework
+71c53f1a NEON: speed-up strong filtering
+73f567ea Merge "get_disto: remove redundant reader check"
+9e14276f Merge "makefiles: prune get_disto & webp_quality deps"
+99965bac Merge "Makefile.vc: add get_disto.exe, webp_quality.exe"
+d4912238 get_disto: remove redundant reader check
+ea482409 makefiles: prune get_disto & webp_quality deps
+2ede5a19 Makefile.vc: add get_disto.exe, webp_quality.exe
+a345068a ARM: speed up bitreader by avoiding tables
+1dc82a6b Merge "introduce a generic GetCoeffs() function pointer"
+8074b89e introduce a generic GetCoeffs() function pointer
+749a45a5 Merge "NEON: implement alpha-filters (horizontal/vertical/gradient)"
+74c053b5 Merge "NEON: fix overflow in SSE NxN calculation"
+0a3aeff7 Merge "dsp: WebPExtractGreen function for alpha decompression"
+1de931c6 NEON: implement alpha-filters (horizontal/vertical/gradient)
+9b3aca40 NEON: fix overflow in SSE NxN calculation
+1c07a3c6 dsp: WebPExtractGreen function for alpha decompression
+9ed5e3e5 use pointers for WebPRescaler's in WebPDecParams
+db013a8d Merge "ARM: don't use USE_GENERIC_TREE"
+fcd4784d use a 8b table for C-version for clz()
+fbb5c473 ARM: don't use USE_GENERIC_TREE
+8fda5612 Merge "add a kSlowSSSE3 feature for CPUInfo"
+86bbd245 add a kSlowSSSE3 feature for CPUInfo
+7c2779e9 Get code to fully compile in C++.
+250c3586 Merge "When compiling as C++, avoid narrowing warnings."
+c0648ac2 When compiling as C++, avoid narrowing warnings.
+0d55f60c 40% faster ApplyAlphaMultiply_SSE2
+49d0280d NEON: implement several alpha-processing functions
+48b1e85f SSE2: 15% faster alpha-processing functions
+e3b8abbc fix warning from static analysis.
+28fe054e SSE2: 30% faster ApplyAlphaMultiply()
+f44acd25 Merge "Properly compute the optimal color cache size."
+527844fe Properly compute the optimal color cache size.
+be0ef639 fix a comment typo
+8874b162 Fix a non-deterministic color cache size computation.
+d712e20d Do not allow a color cache size bigger than the number of colors.
+ecff04f6 re-introduce some comments in Huffman Cost.
+259e9828 replace 'ptr + y * stride' by 'ptr += stride'
+00b08c88 Merge "NEON: 5% faster conversion to RGB565 and RGBA4444"
+0e7f4447 Merge "NEON: faster fancy upsampling"
+b016cb91 NEON: faster fancy upsampling
+1cb63801 Call the C function to finish off lossless SSE loops only when necessary.
+875fafc1 Implement BundleColorMap in SSE2.
+3674d49e Merge "remove Clang warnings with unused arch arguments."
+f04eb376 Merge tag 'v0.5.2'
+341d711c NEON: 5% faster conversion to RGB565 and RGBA4444
+abb54827 remove Clang warnings with unused arch arguments.
+ece9684f update ChangeLog (tag: v0.5.2-rc2, tag: v0.5.2, origin/0.5.2, 0.5.2)
+aa7744ca anim_util: quiet implicit conv warnings in 32-bit
+d9120271 jpegdec: correct ContextFill signature
+24eb3940 Remove some errors when compiling the code as C++.
+a4a8e5f3 vwebp: clear canvas during resize w/o animation
+67c25ad5 vwebp: clear canvas during resize w/o animation
+a4bbe4b3 fix indentation
+31ca2a80 tiffdec: restore libtiff 3.9.x compatibility
+b2f77b57 update NEWS
+5ab6d9de AnimEncoder: avoid freeing uninitialized memory pointer.
+f29bf582 WebPAnimEncoder: If 'minimize_size' and 'allow_mixed' on, try lossy + lossless.
+3ebe1c00 AnimEncoder: avoid freeing uninitialized memory pointer.
+df780e0e fix a potential overflow with MALLOC_LIMIT
+58fc5078 Merge "PredictorSub: implement fully-SSE2 version"
+9cc42167 PredictorSub: implement fully-SSE2 version
+0aa1f35c remove dependency of imageio/ to stopwatch.h
+cb9ec84b Merge "remove the dependency to stop_watch.[ch] in imageio"
+dc0c01fb Merge "anim_util: quiet implicit conv warnings in 32-bit"
+827d3c50 Merge "fix a potential overflow with MALLOC_LIMIT"
+1e2e25b0 anim_util: quiet implicit conv warnings in 32-bit
+218460cd bump version to 0.5.2
+de7d654d update AUTHORS & .mailmap
+273367c1 Merge "dsp/lossless.c,cosmetics: fix indent"
+76bbcf2e fix a potential overflow with MALLOC_LIMIT
+8ac1abfe Merge "jpegdec: correct ContextFill signature"
+cb215aed remove the dependency to stop_watch.[ch] in imageio
+2423017a dsp/lossless.c,cosmetics: fix indent
+74a12b10 iosbuild.sh: add WebPDecoder.framework + encoder
+a9cc7621 Merge "iosbuild.sh: add WebPDecoder.framework + encoder"
+fbba5bc2 optimize predictor #1 in plain-C For some reason, gcc has hard time inlining this one...
+9ae0b3f6 Merge "SSE2: slightly (~2%) faster Predictor #1"
+c1f97bd7 SSE2: slightly (~2%) faster Predictor #1
+ea664b89 SSE2: 10% faster Predictor #11
+be7dcc08 AnimEncoder: Correctly skip a frame when sub-rectangle is empty.
+40885830 Fix assertions in WebPRescalerExportRow()
+1d5046d1 iosbuild.sh: add WebPDecoder.framework + encoder
+cec72014 jpegdec: correct ContextFill signature
+8f38c72e fix a typo in WebPPictureYUVAToARGB's doc
+33ca93f9 systematically call WebPDemuxReleaseIterator() on dec->prev_iter_
+76e19073 doc: use two's complement explicitly for uint8->int8 conversion
+f91ba963 Anim_encoder: correctly handle enc->prev_candidate_undecided_
+25d74e65 WebPPictureDistortion(): free() -> WebPSafeFree()
+03f1c008 mux/Makefile.am: add missing -lm
+58410cd6 fix bug in RefineUsingDistortion()
+e168af8c fix filtering auto-adjustment
+ed9dec41 fix doc and code snippet for WebPINewDecoder() doc
+3c49178f prevent 32b overflow for very large canvas_width / height
+9595f290 fix anim_util.c compilation when HAVE_GIF is not defined.
+7ec9552c Make gif transparent color to be transparent black
+b3fb8bb6 slightly faster Predictor #11 in NEON
+9871335f Add a CMake option for WEBP_SWAP_16BIT_CSP.
+0ae32226 Fix missing cpu-features for Android.
+ab4c8056 cpu.cmake: improve webp_check_compiler_flag output
+eec5fa3a Provide support for CMake on Android studio 2.2.
+004d5690 Split the main CMake file.
+4fe5d588 Android.mk: use -fvisibility=hidden
+bd63a31a vwebp: ensure setenv() is available in stdlib.h
+363a5681 vwebp: handle window resizing properly
+a0d2753f lower WEBP_MAX_ALLOCABLE_MEMORY default
+31fe11a5  fix infinite loop in case of PARTITION0 overflow
+532215dd Change the rule of picking UV mode in MBAnalyzeBestUVMode()
+9c75dbd3 cwebp.1: improve some grammar
+af2e05cb vwebp: Clear previous frame when a key triggers a redraw
+26ffa296 Add descriptions of default configuration in help info.
+7416280d Fix an unsigned integer overflow error in enc/cost.h
+13cf1d2e Do token recording and counting in a single loop
+eb9a4b97 Reset segment id if we decide not to update segment map
+42ebe3b7 configure: fix NEON flag detection under gcc 6
+76ebbfff NEON: implement predictor #13
+95b12a08 Merge "Revert Average3 and Average4"
+54ab2e75 Revert Average3 and Average4
+fe12330c 3-5% faster Predictor #5, #6, #7 and #10 for NEON
+fbfb3bef ~2% faster predictor #10 for NEON
+d4b7d801 lossless_sse2: use the local functions
+a5e3b225 Lossless decoder SSE2 improvements.
+58a1f124 ~2% faster predictor #12 in NEON.
+906c3b63 Merge "Implement lossless transforms in NEON."
+d23abe4e Implement lossless transforms in NEON.
+2e6cb6f3 Give more flexibility to the predictor generating macro.
+28e0bb70 Merge "Fix race condition in multi-threading initialization."
+64704530 Fix race condition in multi-threading initialization.
+bded7848 img2webp: fix default -lossless value and use pic.argb=1
+0e61a513 Merge "img2webp: convert a sequence of images to an animated webp"
+1cc79e92 AnimEncoder: Correctly skip a frame when sub-rectangle is empty.
+03f40955 img2webp: convert a sequence of images to an animated webp
+ea72cd60 add missing 'extern' keyword for predictor dcl
+67879e6d SSE implementation of decoding predictors.
+34aee990 Merge "vwebp: make 'd' key toggle the debugging of fragments"
+a41296ae Fix potentially uninitialized value.
+c85adb33 vwebp: make 'd' key toggle the debugging of fragments
+4239a148 Make the lossless predictors work on a batch of pixels.
+bc18ebad fix extra 'const's in signatures
+71e2f5ca Remove memcpy in lossless decoding.
+7474d46e Do not use a register array in SSE.
+67748b41 Improve latency of FTransform2.
+16951b19 Merge "Provide an SSE implementation of ConvertBGRAToRGB"
+6540cd0e Provide an SSE implementation of ConvertBGRAToRGB
+de568abf Android.mk: use -fvisibility=hidden
+3c2a61b0 remove some unneeded casts
+9ac063c3 add dsp functions for SmartYUV
+22efabdd Merge "smart_yuv: switch to planar instead of packed r/g/b processing"
+1d6e7bf3 smart_yuv: switch to planar instead of packed r/g/b processing
+0a3838ca fix bug in RefineUsingDistortion()
+c0699515 webpmux -duration: set default 'end' value equal to 'start'
+83cbfa09 Import: use relative pointer offsets
+a1ade40e PreprocessARGB: use relative pointer offsets
+fd4d090f ConvertWRGBToYUV: use relative pointer offsets
+9daad459 ImportYUVAFromRGBA: use relative pointer offsets
+f90c60d1 Merge "add a "-duration duration,start,end" option to webpmux"
+3f182d36 add a "-duration duration,start,end" option to webpmux
+342e15f0 Import: use relative pointer offsets
+1147ab4e PreprocessARGB: use relative pointer offsets
+e4cd4daf fix filtering auto-adjustment
+e7152856 fix doc and code snippet for WebPINewDecoder() doc
+de9fa507 ConvertWRGBToYUV: use relative pointer offsets
+deb1b831 ImportYUVAFromRGBA: use relative pointer offsets
+c284780f imageio_util: add ImgIoUtilCheckSizeArgumentsOverflow
+e375080d gifdec,Remap: avoid out of bounds colormap read
+c222a053 additional fix for stride type as size_t
+bb233617 fix potential overflow when width * height * 4 >= (1<<32)
+883d41fb gif2webp: fix crash with NULL extension data
+cac9a36a gifdec,Remap: avoid out of bounds colormap read
+4595e01f Revert "gifdec,Remap: avoid out of bounds colormap read"
+fb52d443 gifdec: make some constants unsigned
+f048d38d gifdec,Remap: avoid out of bounds colormap read
+31b1e343 fix SSIM metric ... by ignoring too-dark area
+2f51b614 introduce WebPPlaneDistortion to compute plane distortion
+0104d730 configure: fix NEON flag detection under gcc 6
+265abbe9 Merge "additional fix for stride type as size_t"
+f7601aa6 Merge "Introduce a generic WebPGetImageReader(type) function"
+ce873320 Introduce a generic WebPGetImageReader(type) function
+2a2773ea imageio/*dec,Read*: add input parameter checks
+9f5c8eca additional fix for stride type as size_t
+4eb5df28 remove unused stride fields from VP8Iterator
+11bc423a MIN_LENGTH cleanups.
+273d035a Merge "fix a typo in WebPPictureYUVAToARGB's doc"
+4db82a17 Merge "fix potential overflow when width * height * 4 >= (1<<32)"
+e2affacc fix potential overflow when width * height * 4 >= (1<<32)
+dc789ada fix a typo in WebPPictureYUVAToARGB's doc
+539f5a68 Fix non-included header in config.c.
+aaf2a6a6 systematically call WebPDemuxReleaseIterator() on dec->prev_iter_
+20ef9915 Merge "imageio_util: add ImgIoUtilCheckSizeArgumentsOverflow"
+bc86b7a8 imageio_util: add ImgIoUtilCheckSizeArgumentsOverflow
+806f6279 gif2webp: fix crash with NULL extension data
+68ae5b67 Add libwebp/src/mux/animi.h
+28ce3043 Remove some errors when compiling the code as C++.
+b34abcb8 Favor keeping the areas locally similar in spatial prediction mode selection
+ba843a92 fix some SSIM calculations
+51b71fd2 Merge "vwebp: ensure setenv() is available in stdlib.h"
+fb01743a get_disto: fix the r/g/b order for luma calculation
+bfab8947 vwebp: ensure setenv() is available in stdlib.h
+9310d192 vwebp: handle window resizing properly
+f79450ca Speedup ApplyMap.
+cfdda7c6 Merge "prevent 32b overflow for very large canvas_width / height"
+e36396ba Merge "get_disto: new option to compute SSIM map and convert to gray"
+18a9a0ab Add an API to import a color-mapped image.
+30d43706 Speed-up Combined entropy for palettized histograms.
+36aa087b get_disto: new option to compute SSIM map and convert to gray
+86a84b35 2x faster SSE2 implementation of SSIMGet
+b8384b53 lower WEBP_MAX_ALLOCABLE_MEMORY default
+1c364400 prevent 32b overflow for very large canvas_width / height
+eee0cce1 Merge "Small LZ77 speedups."
+5f1caf29 Small LZ77 speedups.
+1effde7b fix anim_util.c compilation when HAVE_GIF is not defined.
+a2fe9bf4 Speedup TrellisQuantizeBlock().
+573cce27 smartYUV improvements
+21e7537a  fix infinite loop in case of PARTITION0 overflow
+053a1565 Merge "Change the rule of picking UV mode in MBAnalyzeBestUVMode()"
+1377ac2e Change the rule of picking UV mode in MBAnalyzeBestUVMode()
+7c1fb7d0 fix uint32_t initialization (0. -> 0)
+bfff0bf3 speed-up SSIM calculation
+64577de8 De-VP8L-ize GetEntropUnrefinedHelper.
+a7be7328 Merge "refactor the PSNR / SSIM calculation code"
+50c3d7da refactor the PSNR / SSIM calculation code
+d6228aed indentation fix after I7055d3ee3bd7ed5e78e94ae82cb858fa7db3ddc0
+dd538b19 Remove unused declaration.
+6cc48b17 Move some lossless logic out of dsp.
+78363e9e Merge "Remove a redundant call to InitLeft() in VP8IteratorReset()"
+ffd01929 Refactor VP8IteratorNext().
+c4f6d9c9 Remove a redundant call to InitLeft() in VP8IteratorReset()
+c27d8210 Merge "smartYUV: simplify main loop"
+07795296 smartYUV: simplify main loop
+c9b45863 Split off common lossless dsp inline functions.
+490ae5b1 smartYUV: improve initial state for faster convergence
+894232be smartYUV: fix and simplify the over-zealous stop criterion
+8de08483 Remove unused code in webpi.h
+41cab7fe imageio/Android.mk: correct imagedec dependencies
+82c91c70 Merge "libimageenc.a: extract image-saving code from dwebp"
+af1ad3e2 libimageenc.a: extract image-saving code from dwebp
+dd7309e3 Merge "doc: use two's complement explicitly for uint8->int8 conversion"
+6105777e Merge "add gif2webp to CMake"
+13ae011e doc: use two's complement explicitly for uint8->int8 conversion
+4bda0cfb add gif2webp to CMake
+6029c7fe Merge "remove mention of fragment, frgm, FRGM, etc."
+545c147f remove mention of fragment, frgm, FRGM, etc.
+5b46f7fc cwebp.1: improve some grammar
+9e478f80 dec/vp8l.c: add assertions in EmitRescaledRowsRGBA/YUVA
+43bd8958 Make gif transparent color to be transparent black
+0887fc2d Merge "get_disto: add a '-o file' option to save a diff map"
+0de48e18 get_disto: add a '-o file' option to save a diff map
+0a57ad0d cosmetics: WebPSafeAlloc -> WebPSafeMalloc
+0a4699bc Merge "WebPPictureDistortion(): free() -> WebPSafeFree()"
+29fedbf5 Anim_encoder: correctly handle enc->prev_candidate_undecided_
+32dead4e WebPPictureDistortion(): free() -> WebPSafeFree()
+85cd5d06 Smarter LZ77 for uniform regions.
+6585075f Change PixelsAreSimilar() to handle black pixels correctly.
+c0a27fd2 vwebp: Clear previous frame when a key triggers a redraw
+57a5e3b6 webp_quality should return '0' in case of success.
+7f1b897b Faster stochastic histogram merging.
+48c810b8 Merge "remove WEBP_FORCE_ALIGNED and use memcpy() instead."
+3884972e remove WEBP_FORCE_ALIGNED and use memcpy() instead.
+485cac1a switch libimagedec.a and libimageio_util.a to avoid undefined symbol
+005e15b1 Merge "{extras,mux}/Makefile.am: add missing -lm"
+6ab496ed fix some 'unsigned integer overflow' warnings in ubsan
+8a4ebc6a Revert "fix 'unsigned integer overflow' warnings in ubsan"
+9d4f209f Merge changes I25711dd5,I43188fab
+e44f5248 fix 'unsigned integer overflow' warnings in ubsan
+27b5d991 Fix assertions in WebPRescalerExportRow()
+74f6f9e7 Add descriptions of default configuration in help info.
+aaf2530c {extras,mux}/Makefile.am: add missing -lm
+1269dc7c Refactor VP8LColorCacheContains()
+40872fb2 dec_neon,NeedsHev: micro optimization
+7b54e26b Add a CMake option for WEBP_SWAP_16BIT_CSP.
+d2223d8d Fix missing cpu-features for Android.
+bf16a4b4 Merge "cpu.cmake: improve webp_check_compiler_flag output"
+ee1057e3 cpu.cmake: improve webp_check_compiler_flag output
+b551e587 cosmetics: add {}s on continued control statements
+d2e4484e dsp/Makefile.am: put msa source in correct lib
+c7f66c82 Merge "utils/thread.c,cosmetics: join a few lines"
+98d8f295 Merge "examples/Makefile.am,cosmetics: sort binary targets"
+39f4ffbc utils/thread.c,cosmetics: join a few lines
+a86ce2b1 Merge "extras/Makefile.am: don't install libwebpextras"
+6fa9fe24 extras/Makefile.am: don't install libwebpextras
+0b2c58a9 Fix an unsigned integer overflow error in enc/cost.h
+d7ce4a2e examples/Makefile.am,cosmetics: sort binary targets
+386e4ba2 Reset segment id if we decide not to update segment map
+7b87e848 Merge "Add MSA optimized YUV to RGB upsampling functions"
+d3ddacb6 Add MSA optimized YUV to RGB upsampling functions
+eb98d8d8 webp_quality: detect lossless format and features
+ebee57f4 move imageio/example_util.[hc] (back to) examples/
+99542bbf webpdec: s/ExUtil//
+da573cf4 imageio_util: s/ExUtil/ImgIoUtil/
+bdda5bd4 split example_util.h
+15ed462b .gitignore: add extras/{get_disto,webp_quality}
+7be57489 Merge "VP8EstimateQuality(): roughty estimate webp bitstream quality factor"
+57020525 Makefile.vc: add missing imageio target
+e8ab6a82 VP8EstimateQuality(): roughty estimate webp bitstream quality factor
+fee7b3d6 Merge "'extras/get_disto' example: compute PSNR between two files"
+1e7d4401 'extras/get_disto' example: compute PSNR between two files
+4cecab63 pngdec.c,jpegdec.[hc]: remove unnecessary includes
+259f0434 makefile.unix: normalize image decode lib name
+ed34c39b fix: examples/libexample_dec.a => imageio/libexample_dec.a
+33d8d0d4 Merge "move examples/{example_util,image_dec} to imageio/"
+c960b82e Merge "extras.h: correct include guard"
+fe3cd28a Merge ".gitignore: add .gradle, /build"
+45fbeba5 Merge "Do token recording and counting in a single loop"
+4f33c820 .gitignore: add .gradle, /build
+c379b55a move examples/{example_util,image_dec} to imageio/
+5108d9aa extras.h: correct include guard
+ad497fbc move src/extras to the top-level
+0c0fb832 Do token recording and counting in a single loop
+9ac74f92 Add MSA optimized rescaling functions
+cb19dbc1 Add MSA optimized color transform functions
+3f4042b5 WebPAnimEncoder: If 'minimize_size' and 'allow_mixed' on, try lossy + lossless.
+5e2eb89e cosmetics,dsp/*msa.c: associate '*' with the type
+5b60db5c FastMBAnalyze() for quick i16/i4 decision
+567e6977 Add MSA optimized CollectHistogram function
+c54ab8dd Add MSA optimized quantization functions
+ec6f68c5 Merge "Remove QuantizeBlockWHT() in enc.c"
+2a5c417c Apply the RLE heuristic to LZ77.
+91b59e88 Remove QuantizeBlockWHT() in enc.c
+fe572737 Add MSA optimized SSE functions
+6b53ca87 cosmetics,(dec|enc)_sse2.c: fix indent
+b15d00d9 Merge "Add MSA optimized encoder IntraChromaPreds function"
+afe3cec8 Add MSA optimized encoder IntraChromaPreds function
+fc8cad9f reduce the number of malloc/free cycles in huffman.c
+7b4b05e0 Add MSA optimized encoder Intra16Preds function
+c18787a0 Add MSA optimized encoder Intra4Preds function
+479d1908 webpmux: Also print compression info per frame.
+a80e8cfd Provide support for CMake on Android studio 2.2.
+6c628410 Split the main CMake file.
+bbb6ecd9 Merge "Add MSA optimized distortion functions"
+7915396f Add MSA optimized distortion functions
+652e944f Merge "build.gradle: remove tab"
+c0991a14 io,EmitRescaledAlphaYUV: factor out a common expr
+48bf5ed1 build.gradle: remove tab
+bfef6c9f Merge tag 'v0.5.1'
+3d97bb75 update ChangeLog (tag: v0.5.1, origin/0.5.1, 0.5.1)
+deb54d91 Clarify the expected 'config' lifespan in WebPIDecode()
+435308e0 Add MSA optimized encoder transform functions
+dce64bfa Add MSA optimized alpha filter functions
+429120d0 Add MSA optimized color transform functions
+c7e2d245 update ChangeLog (tag: v0.5.1-rc5)
+55b2fede normalize the macros' "do {...} while (0)" constructs
+701c772e Add MSA optimized colorspace conversion functions
+c7eb06f7 Fix corner case in CostManagerInit.
+f918cb10 fix rescaling bug: alpha plane wasn't filled with 0xff
+ab7937a5 gif2webp: normalize the number of .'s in the help message
+3cdec847 vwebp: normalize the number of .'s in the help message
+bdf6241e cwebp: normalize the number of .'s in the help message
+06a38c7b fix rescaling bug: alpha plane wasn't filled with 0xff
+319e37be Improve lossless compression.
+6a197937 Add MSA optimized intra pred chroma functions
+447adbce 'our bug tracker' -> 'the bug tracker'
+97b9e644 normalize the number of .'s in the help message
+293d786f Added MSA optimized intra prediction 16x16 functions
+0afa0ce2 Added MSA optimized intra prediction 4x4 functions
+a6621bac Added MSA optimized simple edge filtering functions
+bb50bf42 pngdec,ReadFunc: throw an error on invalid read
+38063af1 decode.h,WebPGetInfo: normalize function comment
+1ebf193c Added MSA optimized chroma edge filtering functions
+9ad2352d Merge "Added MSA optimized edge filtering functions"
+60751096 Added MSA optimized edge filtering functions
+9e8e1b7b Inline GetResidual for speed.
+7d58d1b7 Speed-up uniform-region processing.
+8ec7032b simplify HistogramCombineEntropyBin()
+23e29cb1 Merge "Fix a boundary case in BackwardReferencesHashChainDistanceOnly." into 0.5.1
+472a049b remove bin_map[] allocation altogether
+0bb23b2c free -> WebPSafeFree()
+a977b4b5 Merge "rewrite the bin_map clustering to use less memory"
+3591ba66 rewrite the bin_map clustering to use less memory
+e6ac450c utils.[hc]: s/MAX_COLOR_COUNT/MAX_PALETTE_SIZE/
+e7b91772 Merge "DecodeImageData(): change the incorrect assert" into 0.5.1
+2abfa54f DecodeImageData(): change the incorrect assert
+5a48fcd8 Merge "configure: test for -Wfloat-conversion"
+0174d18d Fix a boundary case in BackwardReferencesHashChainDistanceOnly.
+6a9c262a Merge "Added MSA optimized transform functions"
+cfbcc5ec Make sure to consider small distances in LZ77.
+5e60c42a Added MSA optimized transform functions
+3dc28d76 configure: test for -Wfloat-conversion
+f2a0946a add some asserts to delimit the perimeter of CostManager's operation
+9a583c66 fix invalid-write bug for alpha-decoding
+f66512db make gradlew executable
+6fda58f1 backward_references: quiet double->int warning
+a48cc9d2 Merge "Fix a compression regression for images with long uniform regions." into 0.5.1
+cc2720c1 Merge "Revert an LZ77 boundary constant." into 0.5.1
+059aab4f Fix a compression regression for images with long uniform regions.
+b0c7e49e Check more backward matches with higher quality.
+a3611513 Revert an LZ77 boundary constant.
+8190374c README: fix typo
+7551db44 update NEWS
+0fb2269c bump version to 0.5.1
+f4537610 update AUTHORS & .mailmap
+3259571e Refactor GetColorPalette method.
+1df5e260 avoid using tmp histogram in PreparePair()
+7685123a fix comment typos
+a246b921 Speedup backward references.
+76d73f18 Merge "CostManager: introduce a free-list of ~10 intervals"
+eab39d81 CostManager: introduce a free-list of ~10 intervals
+4c59aac0 Merge "mips msa webp configuration"
+043c33f1 Merge "Improve speed and compression in backward reference for lossless."
+71be9b8c Merge "clarify variable names in HistogramRemap()"
+0ba7fd70 Improve speed and compression in backward reference for lossless.
+0481d42a CostManager: cache one interval and re-use it when possible
+41b7e6b5 Merge "histogram: fix bin calculation"
+96c3d624 histogram: fix bin calculation
+fe9e31ef clarify variable names in HistogramRemap()
+ce3c8247 disable near-lossless quantization if palette is used
+e11da081 mips msa webp configuration
+5f8f998d mux: Presence of unknown chunks should trigger VP8X chunk output.
+cadec0b1 Merge "Sync mips32 and dsp_r2 YUV->RGB code with C verison"
+d9637758 Compute the hash chain once and for all for lossless compression.
+50a48665 Sync mips32 and dsp_r2 YUV->RGB code with C verison
+eee788e2 Merge "introduce a common signature for all image reader function"
+d77b877c introduce a common signature for all image reader function
+ca8d9519 remove some obsolete TODOs
+ae2a7222 collect all decoding utilities from examples/ in libexampledec.a
+0b8ae852 Merge "Move DitherCombine8x8 to dsp/dec.c"
+77cad885 Merge "ReadWebP: avoid conversion to ARGB if final format is YUVA"
+ab8d6698 ReadWebP: avoid conversion to ARGB if final format is YUVA
+f8b7ce9e Merge "test pointer to NULL explicitly"
+5df6f214 test pointer to NULL explicitly
+77f21c9c Move DitherCombine8x8 to dsp/dec.c
+c9e6d865 Add gradle support
+c65f41e8 Revert "Add gradle support"
+bf731ede Add gradle support
+08333b85 WebPAnimEncoder: Detect when canvas is modified, restore only when needed.
+0209d7e6 Merge "speed-up MapToPalette() with binary search"
+fdd29a3d speed-up MapToPalette() with binary search
+cf4a651b Revert "Refactor GetColorPalette method."
+0a27aca3 Merge changes Idfa8ce83,I19adc9c4
+f25c4406 WebPAnimEncoder: Restore original canvas between multiple encodes.
+169004b1 Refactor GetColorPalette method.
+576362ab VP8LDoFillBitWindow: support big-endian in fast path
+ac49e4e4 bit_reader.c: s/VP8L_USE_UNALIGNED_LOAD/VP8L_USE_FAST_LOAD/
+d39ceb58 VP8LDoFillBitWindow: remove stale TODO
+2ec2de14 Merge "Speed-up BackwardReferencesHashChainDistanceOnly."
+3e023c17 Speed-up BackwardReferencesHashChainDistanceOnly.
+f2e1efbe Improve near lossless compression when a prediction filter is used.
+e15afbce dsp.h: fix ubsan macro name
+e53c9ccb dsp.h: add WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+af81fdb7 utils.h: quiet -fsanitize=undefined warnings
+ea0be354 dsp.h: remove utils.h include
+cd276aec utils/*.c: ../utils/utils.h -> ./utils.h
+c8927131 utils/Makefile.am: add some missing headers
+ea24e026 Merge "dsp.h: add WEBP_UBSAN_IGNORE_UNDEF"
+369e264e dsp.h: add WEBP_UBSAN_IGNORE_UNDEF
+0d020a78 Merge "add runtime NEON detection"
+5ee2136a Merge "add VP8LAddPixels() to lossless.h"
+47435a61 add VP8LAddPixels() to lossless.h
+8fa6ac68 remove two ubsan warnings
+74fb56fb add runtime NEON detection
+4154a839 MIPS update to new Unfilter API
+c80b9fc8 Merge "cherry-pick decoder fix for 64-bit android devices"
+6235147e cherry-pick decoder fix for 64-bit android devices
+d41b8c43 configure: test for -Wformat-* w/-Wformat present
+5f95589f Fix WEBP_ALIGN in case the argument is a pointer to a type larger than a byte.
+2309fd5c replace num_parts_ by num_parts_minus_one_ (unsigned)
+9629f4bc SimplifySegments: quiet -Warray-bounds warning
+de47492e Merge "update the Unfilter API in dsp to process one row independently"
+2102ccd0 update the Unfilter API in dsp to process one row independently
+e3912d56 WebPAnimEncoder: Restore canvas before evaluating blending possibility.
+6e12e1e3 WebPAnimEncoder: Fix for single-frame optimization.
+602f344a Merge changes I1d03acac,Ifcb64219
+95ecccf6 only apply color-mapping for alpha on the cropped area
+47dd0708 anim_diff: Add an experimental option for max inter-frame diff.
+aa809cfe only allocate alpha_plane_ up to crop_bottom row
+31f2b8d8 WebPAnimEncoder: FlattenSimilarPixels(): look for similar
+774dfbdc perform alpha filtering within the decoding loop
+a4cae68d lossless decoding: only process decoded row up to last_row
+238cdcdb Only call WebPDequantizeLevels() on cropped area
+cf6c713a alpha: preparatory cleanup
+b95ac0a2 Merge "VP8GetHeaders(): initialize VP8Io with sane value for crop/scale dimensions"
+89231394 VP8GetHeaders(): initialize VP8Io with sane value for crop/scale dimensions
+5828e199 use_8b_decode -> use_8b_decode_
+8dca0247 fix bug in alpha.c that was triggering a memory error in incremental mode
+9a950c53 WebPAnimEncoder: Disable filtering when blending is used with lossy encoding.
+eb423903 WebPAnimEncoder: choose max diff for framerect based on quality.
+ff0a94be WebPAnimEncoder lossy: ignore small pixel differences for frame rectangles.
+f8040084 gif2webp: Remove the 'prev_to_prev_canvas' buffer.
+6d8c07d3 Merge "WebPDequantizeLevels(): use stride in CountLevels()"
+d96fe5e0 WebPDequantizeLevels(): use stride in CountLevels()
+ec1b2407 WebPPictureImport*: check output pointer
+c0768769 Merge "Revert "Re-enable encoding of alpha plane with color cache for next release.""
+41f14bcb WebPPictureImport*: check src pointer
+64eed387 Pass stride parameter to WebPDequantizeLevels()
+97934e24 Revert "Re-enable encoding of alpha plane with color cache for next release."
+e88c4ca0 fix -m 2 mode-cost evaluation (causing partition0 overflow)
+4562e83d Merge "add extra meaning to WebPDecBuffer::is_external_memory"
+abdb109f add extra meaning to WebPDecBuffer::is_external_memory
+875aec70 enc_neon,cosmetics: break long comment
+71e856cf GetMBSSIM,cosmetics: fix alignment
+a90edffb fix missing 'extern' for SSIM function in dsp/
+423ecaf4 move some SSIM-accumulation function for dsp/
+f08e6624 Merge "Fix FindClosestDiscretized in near lossless:"
+0d40cc5e enc_neon,Disto4x4: remove an unnecessary transpose
+e8feb20e Fix FindClosestDiscretized in near lossless:
+82006430 anim_util: quiet static analysis warning
+a6f23c49 Merge "AnimEncoder: Support progress hook and user data."
+a5193774 Merge "Near lossless feature: fix some comments."
+da98d31c AnimEncoder: Support progress hook and user data.
+33357131 Near lossless feature: fix some comments.
+0beed01a cosmetics: fix indent after 2f5e898
+6753f35c Merge "FTransformWHT optimization."
+6583bb1a Improve SSE4.1 implementation of TTransform.
+7561d0c3 FTransformWHT optimization.
+7ccdb734 fix indentation after patch #328220
+6ec0d2a9 clarify the logic of the error path when decoding fails.
+8aa352b2 Merge "Remove an unnecessary transposition in TTransform."
+db860884 Merge "remove useless #include"
+9960c316 Remove an unnecessary transposition in TTransform.
+6e36b511 Small speedup in FTransform.
+9dbd4aad Merge "fix C and SIMD flags completion."
+e60853ea Add missing common_sse2.h file to makefile.unix
+696eb2b0 fix C and SIMD flags completion.
+2b4fe33e Merge "fix multiple allocation for transform buffer"
+2f5e8986 fix multiple allocation for transform buffer
+bf2b4f11 Regroup common SSE code + optimization.
+4ed650a1 force "-pass 6" if -psnr or -size is used but -pass isn't.
+3ef1ce98 yuv_sse2: fix -Wconstant-conversion warning
+a7a03e9f Merge changes I4852d18f,I51ccb85d
+5e122bd6 gif2webp: set enc_options.verbose = 0 w/-quiet
+ab3c2583 anim_encode,DefaultEncoderOptions: init verbose
+8f0dee77 Merge "configure: fix builtin detection w/-Werror"
+4a7b85a9 cmake: fix builtin detection w/-Werror
+b74657fb configure: fix builtin detection w/-Werror
+3661b980 Add a CMakeLists.txt
+75f4af4d remove useless #include
+6c1d7631 avoid Yoda style for comparison
+8ce975ac SSE optimization for vector mismatch.
+7db53831 Merge tag 'v0.5.0'
+37f04949 update ChangeLog (tag: v0.5.0-rc1, tag: v0.5.0, origin/0.5.0, 0.5.0)
+7e7b6ccc faster rgb565/rgb4444/argb output
+4c7f565f update NEWS
+1f62b6b2 update AUTHORS
+e224fdc8 update mailmap
+71100500 bump version to 0.5.0
+230a685e README: update help text, repo link
+d48e427b Merge "demux: accept raw bitstreams"
+99a01f4f Merge "Unify some entropy functions."
+4b025f10 Merge "configure: disable asserts by default"
+92cbddf8 Merge "fix PrintBlockInfo()"
+ca509a33 Unify some entropy functions.
+367bf903 fix PrintBlockInfo()
+b0547ff0 move back common constants for lossless_enc*.c into the .h
+fb4c7832 lossless: simpler alpha cleanup preprocessing
+ba7f4b68 Merge "anim_diff: add brief description of options"
+47ddd5a4 Move some codec logic out of ./dsp .
+b4106c44 anim_diff: add brief description of options
+357f455d yuv_sse2: fix 32-bit visual studio build
+b9d80fa4 configure: disable asserts by default
+7badd3da cosmetic fix: sizeof(type) -> sizeof(*var)
+80ce27d3 Speed up 24-bit packing / unpacking in YUV / RGB conversions.
+68eebcb0 remove a TODO about rotation
+2dee2966 remove few obsolete TODO about aligned loads in SSE2
+e0c0bb34 remove TODO about unused ref_lf_delta[]
+9cf1cc2b remove few TODO:   * 256 -> RD_DISTO_MULT   * don't use TDisto for UV mode picking
+79189645 Merge changes from topic 'demux-fragment-cleanup'
+47399f92 demux: remove GetFragment()
+d3cfb79a demux: remove dead fragment related TODO
+ab714b8a demux, Frame: remove is_fragment_ field
+b105921c yuv_sse2, cosmetics: fix indent
+466c92e8 demux,WebPIterator: remove fragment_num/num_fragments
+11714ff1 demux: remove WebPDemuxSelectFragment
+c0f7cc47 fix for bug #280: UMR in next->bits
+578beeb8 Merge "enc/Makefile.am: add missing headers"
+1a819f00 makefile.unix: make visibility=hidden the default
+d4f9c2ef enc/Makefile.am: add missing headers
+846caff4 configure: check for -fvisibility=hidden
+3f3ea2c5 demux: accept raw bitstreams
+d6dad5d0 man cwebp: add precision about exactness of the 'lossless' mode
+46bb1e34 Merge "gifdec: remove utils.h include"
+2b882e94 Merge "Makefile.vc: define WEBP_HAVE_GIF for gifdec.c"
+892b9238 Merge "man/*, AUTHORS: clarify origin of the tool"
+e5687a18 Merge "fix optimized build with -mcmodel=medium"
+e56e6859 Makefile.vc: define WEBP_HAVE_GIF for gifdec.c
+4077d944 gifdec: remove utils.h include
+b5e30dac man/*, AUTHORS: clarify origin of the tool
+b275e598 fix optimized build with -mcmodel=medium
+64da45a9 cosmetics, cwebp: fix indent
+038a060d Merge "add disto-based refinement for UV mode (if method = 1 or 2)"
+2835089d Provide an SSE2 implementation of CombinedShannonEntropy.
+e6c93519 add disto-based refinement for UV mode (if method = 1 or 2)
+04507dc9 Merge "fix undefined behaviour during shift, using a cast"
+793c5261 Merge "wicdec: add support for reading from stdin"
+d3d16397 Optimize the heap usage in HistogramCombineGreedy.
+202a710b fix undefined behaviour during shift, using a cast
+14d27a46 improve method #2 by merging DistoRefine() and  SimpleQuantize()
+cb1ce996 Merge "10% faster table-less SSE2/NEON version of YUV->RGB conversion"
+ac761a37 10% faster table-less SSE2/NEON version of YUV->RGB conversion
+79fcf29a wicdec: add support for reading from stdin
+015f173f Merge "cwebp: add support for stdin input"
+a9947c32 cwebp: add support for stdin input
+7eb01ff3 Merge "Improved alpha cleanup for the webp encoder when prediction transform is used."
+fb8c9106 Merge "introduce WebPMemToUint32 and WebPUint32ToMem for memory access"
+bd91af20 Merge "bit_reader: remove aarch64 BITS TODO"
+6c702b81 Speed up hash chain initialization using memset.
+4c60f63c make ReadPNG and ReadJPEG take a filename instead of a FILE
+464ed10f bit_reader: remove aarch64 BITS TODO
+d478e589 Merge "configure: update issue tracker"
+69381113 Improved alpha cleanup for the webp encoder when prediction transform is used.
+2c08aac8 introduce WebPMemToUint32 and WebPUint32ToMem for memory access
+010ca3d1 Fix FindMatchLength with non-aligned buffers.
+a90e1e3f README: add prerequisites for an autoconf build
+458f0866 configure: update issue tracker
+33914595 vwebp: work around the transparent background with GLUT bug
+e4a7eed4 cosmetics: fix indent
+08375129 Merge "Make a separate case for low_effort in CopyImageWithPrediction"
+aa2eb2d4 Merge "cosmetics: fix indent"
+b7551e90 cosmetics: fix indent
+5bda52d4 Make a separate case for low_effort in CopyImageWithPrediction
+66fa598a Merge "configure: fix intrinsics build w/older gcc"
+5ae220be backward_references.c: Fixed compiler warning
+1556da09 Merge "configure: restore 2 warnings"
+71a17e58 configure: restore 2 warnings
+9eeabc07 configure: fix intrinsics build w/older gcc
+363babe2 Merge "fix some warning about unaligned 32b reads"
+a1411782 Optimization in hash chain comparison for 64 bit Arrays were compared 32 bits at a time, it is now done 64 bits at a time. Overall encoding speed-up is only of 0.2% on @skal's small PNG corpus. It is of 3% on my initial 1.3 Mp desktop screenshot image.
+829bd141 Combine Huffman cost and bit entropy into one loop
+a7a954c8 Merge "lossless: make prediction in encoder work per scanline"
+61b605b4 Merge "fix of undefined multiply (int32 overflow)"
+239421c5 lossless: make prediction in encoder work per scanline
+f5ca40e0 fix of undefined multiply (int32 overflow)
+5cd2ef4c Merge changes from topic 'win-threading-compat'
+76ce9187 Makefile.vc: enable WEBP_USE_THREAD for windows phone
+d2afe974 thread: use CreateThread for windows phone
+0fd0e12b thread: use WaitForSingleObjectEx if available
+63fadc9f thread: use InitializeCriticalSectionEx if available
+110ad583 thread: use native windows cond var if available
+912c9fdf dec/webp: use GetLE(24|32) from utils
+f1694481 utils/GetLE32: correct uint32 promotion
+158763de Merge "always call WebPInitSamplers(), don't try to be smart"
+3770f3bb Merge "cleanup the YFIX/TFIX difference by removing some code and #define"
+a40f60a9 Merge "3% speed improvement for lossless webp encoder for low effort mode:"
+ed1c2bc6 always call WebPInitSamplers(), don't try to be smart
+b8c44f1a 3% speed improvement for lossless webp encoder for low effort mode:
+997e1038 cleanup the YFIX/TFIX difference by removing some code and #define
+d73d1c8b Merge "Make discarding invisible RGB values (cleanup alpha) the default."
+1f9be97c Make discarding invisible RGB values (cleanup alpha) the default.
+f240117b Make dwebp listen more to the -quiet flag
+b37b0179 fix for issue #275: don't compare to out-of-bound pointers
+21735e06 speed-up trivial one-symbol decoding case for lossless
+397863bd Refactor CopyPlane() and CopyPixels() methods: put them in utils.
+6ecd72f8 Re-enable encoding of alpha plane with color cache for next release.
+1f7148a4 Merge "remove unused fields from WebPDecoderOptions and WebPBitstreamFeatures"
+6ae395fa Merge "use ExReadFile() for ReadYUV()"
+8076a00e gitignore list: add anim_diff.
+1c1702d8 use ExReadFile() for ReadYUV()
+775d3a37 remove unused fields from WebPDecoderOptions and WebPBitstreamFeatures
+c13245c7 AnimEncoder: Add a GetError() method.
+688b265d AnimDecoder API: Add a GetDemuxer() method.
+1aa4e3d6 WebPAnimDecoder: add an option to enable multi-threaded decoding.
+3584abca AnimDecoder: option to decode to common color modes.
+afd5a62c Merge "mux.h does NOT need to include encode.h"
+8550d443 Merge "migrate anim_diff tool from C++ to C89"
+96201e50 migrate anim_diff tool from C++ to C89
+945cfa3b mux.h does NOT need to include encode.h
+8da07e8d Merge "~2x faster SSE2 RGB24toY, BGR24toY, ARGBToY|UV"
+bfd3fc02 ~2x faster SSE2 RGB24toY, BGR24toY, ARGBToY|UV
+02432427 man/cwebp.1, cosmetics: escape '-'s
+96f5b423 man/cwebp: group lossy-only options
+52fdbdfe extract some RGB24 to Luma conversion function from enc/ to dsp/
+ab8c2300 add missing \n
+8304179a sync NEWS with 0.4.4
+5bd04a08 sync versions with 0.4.4
+8f1fcc15 Merge "Move ARGB->YUV functions from dec/vp8l.c to dsp/yuv.c"
+25bf2ce5 fix some warning about unaligned 32b reads
+922268fd s/TIFF/WebP
+fa8927ef Move ARGB->YUV functions from dec/vp8l.c to dsp/yuv.c
+9b373598 Merge "for ReadXXXX() image-readers, use the value of pic->use_argb"
+f7c507a5 Merge "remove unnecessary #include "yuv.h""
+7861578b for ReadXXXX() image-readers, use the value of pic->use_argb
+14e4043b remove unnecessary #include "yuv.h"
+469ba2cd vwebp: fix incorrect clipping w/NO_BLEND
+4b9186b2 update issue tracker url
+d64d376c change WEBP_ALIGN_CST value to 31
+f717b828 vp8l.c, cosmetics: fix indent after 95509f9
+927ccdc4 Merge "fix alignment of allocated memory in AllocateTransformBuffer"
+fea94b2b fix alignment of allocated memory in AllocateTransformBuffer
+5aa8d61f Merge "MIPS: rescaler code synced with C implementation"
+e7fb267d MIPS: rescaler code synced with C implementation
+93c86ed5 Merge "format_constants.h: MKFOURCC, correct cast"
+5d791d26 format_constants.h: MKFOURCC, correct cast
+65726cd3 dsp/lossless: Average2, make a constant unsigned
+d26d9def Use __has_builtin to check clang support
+12ec204e moved ALIGN_CST into util/utils.h and renamed WEBP_ALIGN_xxx
+a2640838 Merge "rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand"
+3fb600d5 Merge "wicdec: fix alpha detection w/64bpp BGRA/RGBA"
+67c547fd rescaler: ~20% faster SSE2 implementation for lossless ImportRowExpand
+99e3f812 Merge "large re-organization of the delta-palettization code"
+95509f99 large re-organization of the delta-palettization code
+74fb458b fix for weird msvc warning message
+ae49ad86 Merge "SSE2 implementation of ImportRowShrink"
+932fd4df SSE2 implementation of ImportRowShrink
+badfcbaa wicdec: fix alpha detection w/64bpp BGRA/RGBA
+35cafa6c Merge "iosbuild: fix linking with Xcode 7 / iOS SDK 9"
+b0c9d8af label rename: NO_CHANGE -> NoChange
+b4e731cd neon-implementation for rescaler code
+db1321a6 iosbuild: fix linking with Xcode 7 / iOS SDK 9
+6dfa5e3e rescaler: better handling of the fxy_scale=0 special case.
+55c05293 Revert "rescaler: better handling of the fxy_scale=0 special case."
+9f226bf8 rescaler: better handling of the fxy_scale=0 special case.
+f7b8f907 delta_palettization.*: add copyright
+c1e1b710 Changed delta palette to compress better
+0dd28267 Merge "Add delta_palettization feature to WebP"
+48f66b66 Add delta_palettization feature to WebP
+27933e2a anim_encoder: drop a frame if it has same pixels as the prev frame.
+df9f6ec8 Merge "webpmux/DisplayInfo: send non-error output to stdout"
+8af4993b Merge "rescaler_mips_dsp_r2: cosmetics, fix indent"
+2b9d2495 Merge "rescaler: cosmetics, join two lines"
+cc020a8c webpmux/DisplayInfo: send non-error output to stdout
+a288e746 configure: add -Wshorten-64-to-32
+c4c3cf2d pngdec: fix type conversion warnings
+bef8e97d webpmux: fix type conversion warning
+5a84460d rescaler_mips_dsp_r2: cosmetics, fix indent
+acde0aae rescaler: cosmetics, join two lines
+306ce4fd rescaler: move the 1x1 or 2x1 handling one level up
+cced974b remove _mm_set_epi64x(), which is too specific
+56668c9f fix warnings about uint64_t -> uint32_t conversion
+76a7dc39 rescaler: add some SSE2 code
+1df1d0ee rescaler: harmonize function protos
+9ba1894b rescaler: simplify ImportRow logic
+5ff0079e fix rescaler vertical interpolation
+cd82440e VP8LAllocateHistogramSet: align histogram[] entries
+a406b1dd Merge "fix memory over-allocation in lossless rescaler init"
+0fde33e3 add missing const in VP8InitFrame signature
+ac7d5e8d fix memory over-allocation in lossless rescaler init
+017f8ccc Loosen the buffer size checks for Y/U/V/A too.
+15ca5014 loosen the padding check on buffer size
+d623a870 dec_neon: add whitespace around stringizing operator
+29377d55 dsp/mips: cosmetics: add whitespace around XSTR macro
+eebaf97f dsp/mips: add whitespace around stringizing operator
+d39dc8f3 Create a WebPAnimDecoder API.
+03fb7522 gif2webp: print output file size
+14efabbf Android: limit use of cpufeatures
+7b83adbe preparatory cosmetics for Rescaler code fix and clean-up
+77fb41c2 dec/vp8l/DecodeAlphaData: remove redundant cast
+90fcfcd9 Insert less hash chain entries from the beginnings of long copies.
+bd55604d SSE2: add yuv444 converters, re-using yuv_sse2.c
+41a5d99d add a -quiet option to 'dwebp'
+80ab3edb Merge "README: update dwebp help output after 1e595fe"
+32b71b2e README: update dwebp help output after 1e595fe
+3ec11827 use the DispatchAlpha() call from dsp
+c5f00621 incorporate bzero() into WebPRescalerInit() instead of call site
+3ebcdd41 remove duplicate "#include <stdlib.h>"
+1e595fe1 dwebp: add -resize as a synonym for -scale
+24a96932 dec: allow 0 as a scaling dimension
+b9187242 utils/rescaler: add WebPRescalerGetScaledDimensions
+923e8eda Merge "update NEWS"
+020fd099 Merge "WebPPictureDistortion: support ARGB format for 'pic' when computing distortion."
+6a5292f6 update NEWS
+56a2e9f5 WebPPictureDistortion: support ARGB format for 'pic' when computing distortion.
+0ae582e4 configure: test and add -Wunreachable-code
+c2f9dc06 bit_writer: convert VP8L macro values to immediates
+b969f888 Reduce magic in palette reordering
+acb297e9 anim_diff: add a -raw_comparison flag
+155c1b22 Merge changes I76f4d6fe,I45434639
+717e4d5a mips32/mipsDSPr2: function ImportRow rebased
+7df93893 fix rescaling bug (uninitialized read, see bug #254).
+5cdcd561 lossless_enc_neon: add VP8LTransformColor
+a53c3369 lossless_neon: add VP8LTransformColorInverse
+99131e7f Merge changes I9fb25a89,Ibc648e9e
+c4556766 simplify the main loop for downscaling
+2a010f99 lossless_neon: remove predictors 5-13
+ca221bbc ll_enc_neon: enable VP8LSubtractGreenFromBlueAndRed
+585d93db Container spec: clarify ordering of ALPH chunk.
+01d61fd9 lossless: ~20 % speedup
+f722c8f0 lossless: Speed up ComputeCacheEntropy by 40 %
+1ceecdc8 add a VP8LColorCacheSet() method for color cache
+17eb6099 lossless: Allow copying from prev row in rle-mode.
+f3a7a5bf lossless: bit writer optimization
+d97b9ff7 Merge changes from topic 'lossless-enc-improvements'
+0250dfcc msvc: fix pointer type warning in BitsLog2Floor
+52931fd5 lossless: combine the Huffman code with extra bits
+c4855ca2 lossless: Inlining add literal
+8e9c94de lossless: simplify HashChainFindCopy heuristics
+888429f4 lossless: 0.5 % compression density improvement
+7b23b198 lossless: Add zeroes into the predicted histograms.
+85b44d8a lossless: encoding, don't compute unnecessary histo
+d92453f3 lossless: Remove about 25 % of the speed degradation
+2cce0317 Faster alpha coding for webp
+5e75642e lossless: rle mode not to accept lengths smaller than 4.
+84326e4a lossless: Less code for the entropy selection
+16ab951a lossless: 0.37 % compression density improvement
+822f113e add WebPFree() to the API
+0ae2c2e4 SSE2/SSE41: optimize SSE_16xN loops
+39216e59 cosmetics: fix indent after 32462a07
+559e54ca Merge "SSE2: slightly faster FTransformWHT"
+8ef9a63b SSE2: slightly faster FTransformWHT
+f27f7735 lossless_neon: enable VP8LAddGreenToBlueAndRed
+36e9c4bc SSE2: minor cosmetrics on in-loop filter code
+4741fac4 dsp/lossless_*sse2: remove some unnecessary inlines
+1819965e fix warning ("left shift of negative value") using a cast
+70170014 SSE2: speed-up some lossless-encoding functions
+abcb0128 Merge "SSE2: slightly faster (~5%) AddGreenToBlueAndRed()"
+2df5bd30 Merge "Speedup to HuffmanCostCombinedCount"
+9e356d6b SSE2: slightly faster (~5%) AddGreenToBlueAndRed()
+fc6c75a2 SSE2: 53% faster TransformColor[Inverse]
+49073da6 SSE2: 46% speed-up of TransformColor[Inverse]
+32462a07 Speedup to HuffmanCostCombinedCount
+f3d687e3 SSE4.1 implementation of some lossless encoding functions
+bfc300c7 SSE4.1 implementation of some alpha-processing functions
+7f9c98f2 Merge "sse2 in-loop: simplify SignedShift8b() a bit"
+ef314a5d dec_sse2/GetNotHEV: micro optimization
+a729cff9 sse2 in-loop: simplify SignedShift8b() a bit
+422ec9fb simplify Load8x4() a bit
+8df238ec Merge "remove some duplicate FlipSign()"
+751506c4 remove some duplicate FlipSign()
+65ef5afc Merge "lossless: 0.13% compression density gain"
+2beef2f2 lossless: 0.13% compression density gain
+3033f24c lossless: 0.06 % compression density improvement
+64960da9 dec_neon: add VE8uv / VE16
+14dbd87b dec_neon: add HE8uv / HE16
+ac768011 introduce FTransform2 to perform two transforms at a time.
+aa6065ae dec_neon: use vld1_dup(mem) rather than vdup(mem[0])
+8b63ac78 Merge "dec_neon: add TM16"
+f51be09e Merge "dec_neon/TrueMotion: simply left border load"
+dc48196b dec_neon: add TM16
+ea95b305 dec_neon/TrueMotion: simply left border load
+f262d612 speed-up SetResidualSSE2
+bf46d0ac fix mips2 build target
+929a0fdc enc_sse2/TTransform: simplify abs calculation
+17dbd058 enc_sse2/CollectHistogram: simplify abs calculation
+a6c15936 dec_neon: add DC16 intra predictors
+03b4f50d Makefile.vc: add anim_diff build support.
+1b989874 Merge changes I9cd84125,Iee7e387f,I7548be72
+acd7b5af Introduce a test tool anim_diff.
+f274a96c dsp/enc_sse2: add luma4 intra predictors
+040b11bd dsp/enc_sse2: add chroma intra predictors
+aee021bb dsp/enc_sse2: add luma16 intra predictors
+9e00a499 makefile.unix: remove superclean target
+cefc9c09 makefile.unix: clean up after extras target
+4c9af023 dec_neon: add DC8uvNoTopLeft
+dd55b873 Merge "doc/webp-container-spec: update repo browser link"
+f0486968 doc/webp-container-spec: update repo browser link
+9287761d Merge "GetResidualCostSSE2: simplify abs calculation"
+0e009366 dsp/cpu.c(x86): check maximum supported cpuid feature
+b243a4bc GetResidualCostSSE2: simplify abs calculation
+6d4602b8 Merge "fix typo: constitutes -> constitute"
+5fe1fe37 fix typo: constitutes -> constitute
+b83bd7c4 Merge "populate 'libwebpextras' with: import gray, rgb565 and rgb4444 functions"
+b0114a32 Merge "histogram.h: cosmetics: remove unnecessary includes"
+feab45ef gifdec: Move inclusion of webp/config.h to header.
+dbba67d1 histogram.h: cosmetics: remove unnecessary includes
+e978fec6 Merge "VP8LBitReader: fix remaining ubsan error with large shifts"
+d6fe5884 Merge "ReconstructRow: move some one-time inits out of the main loop"
+a21d647c ReconstructRow: move some one-time inits out of the main loop
+7a01c3c3 VP8LBitReader: fix remaining ubsan error with large shifts
+7fa67c9b change GetPixPairHash64() return type to uint32_t
+ec1fb9f8 Merge "dsp/enc.c: cosmetics: move DST() def closer to use"
+7073bfb3 Merge "split 64-mult hashing into two 32-bit multiplies"
+0768b252 dsp/enc.c: cosmetics: move DST() def closer to use
+6a48b8f0 Merge "fix MSVC size_t->int conversion warning"
+1db07cde Merge "anim_encode: cosmetics: fix alignment"
+e28271a3 anim_encode: cosmetics: fix alignment
+7fe357b8 split 64-mult hashing into two 32-bit multiplies
+af74c145 populate 'libwebpextras' with: import gray, rgb565 and rgb4444 functions
+61214134 remove VP8Residual::cost unused field
+e2544823 fix MSVC size_t->int conversion warning
+b69a6c35 vwebp: don't redefine snprintf with VS2015+
+0ac29c51 AnimEncoder API: Consistent use of trailing underscores in struct.
+d4845550 AnimEncoder API: Use timestamp instead of duration as input to Add().
+9904e365 dsp/dec_sse2: DC8uv / DC8uvNoLeft speedup
+7df20497 dsp/dec_sse2: DC16 / DC16NoLeft speedup
+8e515dfe Merge "makefile.unix: add some missing headers"
+db12250f cosmetics: vp8enci.h: break long line
+bf516a87 makefile.unix: add some missing headers
+b44eda3f dsp: add DSP_INIT_STUB
+03e76e96 clarify the comment about double-setting the status in SetError()
+9fecdd71 remove unused EmitRGB()
+43f010dd move ReconstructRow to top
+82d98020 add a dec/common.h header to collect common enc/dec #defines
+5d4744a2 Merge "enc_sse41: add Disto4x4 / Disto16x16"
+e38886a7 mux.h: Bump up ABI version
+46305ca6 configure: add --disable-<avx2|sse4.1|sse2>
+2fc8b658 CPPFLAGS->CFLAGS for detecting sse4.1 in preprocessor
+1a338fb3 enc_sse41: add Disto4x4 / Disto16x16
+94055503 encoding SSE4.1 stub for StoreHistogram + Quantize + SSE_16xN
+c64659e1 remove duplicate variables after the lossless{_enc}.c split
+67ba7c7a enc_sse2: call local FTransform in CollectHistogram
+18249799 dsp: s/VP8LSetHistogramData/VP8SetHistogramData/
+ede5e158 cosmetics: dsp/lossless.h: reorder prototypes
+553051f7 dsp/lossless: split enc/dec functions
+9064adc8 Merge "conditionally add -msse4.1 in Makefile.unix"
+cecf5096 dsp/yuv*.c: rework WEBP_USE_<arch> ifdef
+6584d398 dsp/upsampling*.c: rework WEBP_USE_<arch> ifdef
+80809422 dsp/rescaler*.c: rework WEBP_USE_<arch> ifdef
+1d93ddec dsp/lossless*.c: rework WEBP_USE_<arch> ifdef
+73805ff2 dsp/filters*.c: rework WEBP_USE_<arch> ifdef
+fbdcef24 dsp/enc*.c: rework WEBP_USE_<arch> ifdef
+66de69c1 dsp/dec*.c: rework WEBP_USE_<arch> ifdef
+48e4ffd1 dsp/cost*.c: rework WEBP_USE_<arch> ifdef
+29fd6f90 dsp/argb*.c: rework WEBP_USE_<arch> ifdef
+80ff3813 dsp/alpha*.c: rework WEBP_USE_<arch> ifdef
+bf09cf1e conditionally add -msse4.1 in Makefile.unix
+e9570dd9 stub for SSE4.1 support.
+4a95384b Merge "dsp: add sse4.1 detection"
+cabf4bd2 dsp: add sse4.1 detection
+4ecba1ab thread.h: rename interface param
+b8d706c8 Merge "sync versions with 0.4.3"
+ae64a711 Merge "add shell for libwebpextras"
+92a5da9c sync versions with 0.4.3
+9d4e2d16 Merge "~30% faster smart-yuv (-pre 4) with early-out criterion"
+b1bdbbab ~30% faster smart-yuv (-pre 4) with early-out criterion
+7efb9748 Merge "Disable NEON code on Native Client"
+ac4f5784 Disable NEON code on Native Client
+0873f85b AnimEncoder API: Support input frames in YUV(A) format.
+5c176d2d add shell for libwebpextras
+44bd9561 fix signature for VP8RecordCoeffTokens()
+c9b8ea0e small cosmetics on TokenBuffer.
+76394c09 Merge "MIPS: dspr2: added optimization for TrueMotion"
+0f773693 WebPPictureRescale: add a note about 0 width/height
+241bb5d9 MIPS: dspr2: added optimization for TrueMotion
+6cef0e4f examples/Android.mk: add webpmux_example target
+53c16ff0 Android.mk: add webpmux target
+21852a00 Android.mk: add webpdemux target
+8697a3bc Android.mk: add webpdecoder{,_static} targets
+4a670491 Android.mk: split source lists per-directory
+b5e79422 MIPS: dspr2: Added optimization for some convert functions
+0f595db6 MIPS: dspr2: Added optimization for some convert functions
+8a218b4a MIPS: [mips32|dspr2]: GetResidualCost rebased
+ef987500 Speedup method StoreImageToBitMask by 5%.
+602a00f9 fix iOS arm64 build with Xcode 6.3
+23820507 1-2% faster encoding by removing an indirection in GetResidualCost()
+eddb7e70 MIPS: dspr2: added optimization for DC8uv, DC8uvNoTop and DC8uvNoLeft
+73ba2915 MIPS: dspr2: added optimization for functions RD4 and LD4
+c7129da5 Merge "4-5% faster encoding using SSE2 for GetResidualCost"
+94380d00 MIPS: dspr2: added optimization for functions VE4 and DC4
+2a407092 4-5% faster encoding using SSE2 for GetResidualCost
+17e19862 Merge "MIPS: dspr2: added optimization for simple filtering functions"
+3ec404c4 Merge "dsp: normalize WEBP_TSAN_IGNORE_FUNCTION usage"
+b969f5df dsp: normalize WEBP_TSAN_IGNORE_FUNCTION usage
+d7b8e711 MIPS: dspr2: added optimization for simple filtering functions
+235f774e Merge "MIPS: dspr2: Added optimization for function VP8LTransformColorInverse_C"
+42a8a628 MIPS: dspr2: Added optimization for function VP8LTransformColorInverse_C
+b442bef3 Merge "ApplyFiltersAndEncode: only copy lossless stats"
+b510fbfe doc/webp-container-spec: note MSB order for chunk diagrams
+9bc0f922 ApplyFiltersAndEncode: only copy lossless stats
+3030f115 Merge "dsp/mips: add some missing TSan annotations"
+dfcf4593 Merge "MIPS: dspr2: Added optimization for function VP8LAddGreenToBlueAndRed_C"
+55c75a25 dsp/mips: add some missing TSan annotations
+2cb879f0 MIPS: dspr2: Added optimization for function VP8LAddGreenToBlueAndRed_C
+e1556010 move some cost tables from enc/ to dsp/
+c3a03168 Merge "picture_csp: fix build w/USE_GAMMA_COMPRESSION undefined"
+39537d7c Merge "VP8LDspInitMIPSdspR2: add missing TSan annotation"
+1dd419ce picture_csp: fix build w/USE_GAMMA_COMPRESSION undefined
+43fd3543 VP8LDspInitMIPSdspR2: add missing TSan annotation
+c7233dfc Merge "VP8LDspInit: remove memcpy"
+0ec4da96 picture_csp::InitGammaTables*: add missing TSan annotations
+35579a49 VP8LDspInit: remove memcpy
+97f6aff8 VP8YUVInit: add missing TSan annotation
+f9016d66 dsp/enc::InitTables: add missing TSan annotation
+e3d9771a VP8EncDspCostInit*: add missing TSan annotations
+d97c143d Merge "doc/webp-container-spec: cosmetics"
+309b7908 MIPS: mips32: Added optimization for function SetResidualCoeffs
+a987faed MIPS: dspr2: added optimization for function GetResidualCost
+e7d3df23 doc/webp-container-spec: cosmetics
+be6635e9 Merge "VP8TBufferClear: remove some misleading const's"
+02971e72 Merge "VP8EmitTokens: remove unnecessary param void cast"
+3b77e5a7 VP8TBufferClear: remove some misleading const's
+aa139c8f VP8EmitTokens: remove unnecessary param void cast
+c24d8f14 cosmetics: upsampling_sse2: add const to some casts
+1829c42c cosmetics: lossless_sse2: add const to some casts
+183168f3 cosmetics: enc_sse2: add const to some casts
+860badca cosmetics: dec_sse2: add const to some casts
+0254db97 cosmetics: argb_sse2: add const to some casts
+1aadf856 cosmetics: alpha_processing_sse2: add const to some casts
+1579de3c vwebp: clear canvas at the beginning of each loop
+4b9fa5d0 Merge "webp-container-spec: clarify background clear on loop"
+4c82284d Updated the near-lossless level mapping.
+56039479 webp-container-spec: clarify background clear on loop
+19f0ba0e Implement true-motion prediction in SSE2
+774d4cb7 make VP8PredLuma16[] array non-const
+d7eabb80 Merge "MIPS: dspr2: Added optimization for function CollectHistogram"
+fe42739c Use integers for kmin/kmax for simplicity.
+b9df35f7 AnimEncode API: kmax=0 should imply all keyframes.
+6ce296da MIPS: dspr2: Added optimization for function CollectHistogram
+2c906c40 vwebp: remove unnecessary static Help() prototype
+be0fd1d5 Merge "dec/vp8: clear 'dither_' on skipped blocks"
+e96170fe Merge "vwebp/animation: display last frame on end-of-loop"
+0f017b56 vwebp/animation: display last frame on end-of-loop
+c86b40cc enc/near_lossless.c: fix alignment
+66935fb9 dec/vp8: clear 'dither_' on skipped blocks
+b7de7946 Merge "lossless_neon: enable subtract green for aarch64"
+77724f70 SSE2 version of GradientUnfilter
+416e1cea lossless_neon: enable subtract green for aarch64
+72831f6b Speedup AnalyzeAndInit for low effort compression.
+a6597483 Speedup Analyze methods for lossless compression.
+98c81386 Enable Near-lossless feature.
+c6b24543 AnimEncoder API: Fix for kmax=1 and default kmin case.
+022d2f88 add SSE2 variants for alpha filtering functions
+2db15a95 Temporarily disable encoding of alpha plane with color cache.
+1d575ccd Merge "Lossless decoding: Remove an unnecessary if condition."
+cafa1d88 Merge "Simplify backward refs calculation for low-effort."
+7afdaf84 Alpha coding: reorganize the filter/unfiltering code
+4d6d7285 Simplify backward refs calculation for low-effort.
+ec0d1be5 Cleanup Near-lossless code.
+9814ddb6 Remove the post-transform near-lossless heuristic.
+4509e32e Lossless decoding: Remove an unnecessary if condition.
+f2ebc4a8 Merge "Regression fix for lossless decoding"
+783a8cda Regression fix for lossless decoding
+9a062b8e AnimEncoder: Bugfix for kmin = 1 and kmax = 2.
+0f027a72 simplify smart RGB->YUV conversion code
+0d5b334e BackwardReferencesHashChainFollowChosenPath: remove unused variable
+f480d1a7 Fix to near lossless artefacts on palettized images.
+d4615d08 Merge changes Ia1686828,I399fda40
+cb4a18a7 rename HashChainInit into HashChainReset
+f079e487 use uint16_t for chosen_path[]
+da091212 MIPS: dspr2: Added optimization for function FTransformWHT
+b8c20135 Merge "wicdec: (msvs) quiet some /analyze warnings"
+9b228b54 wicdec: (msvs) quiet some /analyze warnings
+daeb276a Merge "MIPS: dspr2: Added optimization for MultARGBRow function"
+cc087424 Merge "dsp/cpu: (msvs) add include for __cpuidex"
+4a82aab5 Merge changes I87544e92,I0bb6cda5
+7a191398 dwebp/WritePNG: mark png variables volatile
+775dfad2 dwebp: include setjmp.h w/WEBP_HAVE_PNG
+47d26be7 dwebp: correct sign in format strings
+f0e0677b VP8LEncodeStream: add an assert
+c5f7747f VP8LColorCacheCopy: promote an int before shifting
+0de5f33e dsp/cpu: (msvs) add include for __cpuidex
+7d850f7b MIPS: dspr2: Added optimization for MultARGBRow function
+54875293 MIPS: dspr2: added optimization for function QuantizeBlock
+4fbe9cf2 dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
+3fd59039 simplify/reorganize arguments for CollectColorBlueTransforms
+b9e356b9 Disable costly TraceBackwards for method=0.
+a7e7caa4 MIPS: dspr2: added optimization for function TransformColorRed
+2cb39180 Merge "MIPS: dspr2: added optimization for function TransformColorBlue"
+279e6613 Merge "dsp/cpu: add include for _xgetbv() w/MSVS"
+b6c0428e dsp/cpu: add include for _xgetbv() w/MSVS
+d1c4ffae gif2webp: Move GIF decoding related code to a support library.
+07c39559 Merge "AnimEncoder API: Add info in README.mux"
+7b161973 MIPS: dspr2: added optimization for function TransformColorBlue
+d7c4b02a cpu: fix AVX2 detection for gcc/clang targets
+9d299469 AnimEncoder API: Add info in README.mux
+d581ba40 follow-up: clean up WebPRescalerXXX dsp function
+f8740f0d dsp: s/USE_INTRINSICS/WEBP_USE_INTRINSICS/
+ce73abe0 Merge "introduce a separate WebPRescalerDspInit to initialize pointers"
+ab66beca introduce a separate WebPRescalerDspInit to initialize pointers
+205c7f26 fix handling of zero-sized partition #0 corner case
+cbcdd5ff Merge "move rescaler functions to rescaler* files in src/dsp/"
+bf586e88 Merge changes I230b3532,Idf3057a7
+6dc79dc2 Merge "anim_encode: fix type conversion warnings"
+11fce25a Merge "dec_neon: remove returns from void functions"
+c4e63f99 Makefile.vc: add gif2webp target
+4f43d38c enable NEON for Windows ARM builds
+3f6615ac Makefile.vc: add rudimentary Windows ARM support
+e7c5954c dec_neon: remove returns from void functions
+f79c163b anim_encode: fix type conversion warnings
+0f54f1ec Remove gif2webp_util which is no longer needed.
+cbcbedd0 move rescaler functions to rescaler* files in src/dsp/
+ac79ed19 webpmux: remove experimental fragment handling
+e8694d4d mux: remove experimental FRGM parsing
+9e92b6ea AnimEncoder API: Optimize single-frame animated images
+abbae279 Merge "Move over gif2webp to the new AnimEncoder API."
+a28c4b36 MIPS: move WORK_AROUND_GCC define to appropriate place
+012d2c60 MIPS: dspr2: added optimization for functions SSEAxB
+67720c8b Move over gif2webp to the new AnimEncoder API.
+9241ecf4 MIPS: dspr2: added optimization for function Average
+9422211d Merge "Tune BackwardReferencesLz77 for low_effort (m=0)."
+df40057b Merge "Speedup VP8LGetHistoImageSymbols for low effort (m=0) mode."
+ea08466d Tune BackwardReferencesLz77 for low_effort (m=0).
+b0b973c3 Speedup VP8LGetHistoImageSymbols for low effort (m=0) mode.
+c6d32927 argb_sse2: cosmetics
+67f601cd make the 'last_cpuinfo_used' variable names unique
+b9489861 AnimEncoder API: Init method for default options.
+856f8ec1 Merge "AnimEncoder API: Remove AnimEncoderFrameOptions."
+c537514d Merge "AnimEncoder API: GenerateCandidates bugfix."
+dc0ce039 Merge "AnimEncoder API: Compute change rectangle for first frame too."
+f00b639b Merge "AnimEncoder API: In Assemble(), always set animation parameters."
+29ed796c Merge "AnimEncoder lib cleanup: prev to prev canvas not needed."
+9f0dd6e5 Merge "WebPAnimEncoder API: Header and implementation"
+5e56bbe0 AnimEncoder API: Remove AnimEncoderFrameOptions.
+b902c3ea AnimEncoder API: GenerateCandidates bugfix.
+ef3c39bb AnimEncoder API: Compute change rectangle for first frame too.
+eec423ab AnimEncoder API: In Assemble(), always set animation parameters.
+ae1c046e AnimEncoder lib cleanup: prev to prev canvas not needed.
+4b997ae4 WebPAnimEncoder API: Header and implementation
+72208bec move argb_*.o build target to encoder list
+95920538 Merge "multi-thread fix: lock each entry points with a static var"
+4c1b300a Merge "SSE2 implementation of VP8PackARGB"
+fbcc2004 Merge "add -Wformat-nonliteral and -Wformat-security"
+80d950d9 add -Wformat-nonliteral and -Wformat-security
+04c20e75 Merge "MIPS: dspr2: added optimization for function Intra4Preds"
+a437694a multi-thread fix: lock each entry points with a static var
+ca7f60db SSE2 implementation of VP8PackARGB
+72d573f6 simplify the PackARGB signature
+4e2589ff demux: restore strict fragment flag check
+4ba8e074 Merge "webp-container-spec: remove references to fragments"
+e752f0a6 Merge "demux: remove experimental FRGM parsing"
+f8abb112 Merge changes I109ec4d9,I73fe7743
+ae2188a4 MIPS: dspr2: added optimization for function Intra4Preds
+1f4b8642 move VP8EncDspARGBInit() call closer to where it's needed
+14108d78 dec_neon: add DC8uvNoTop / DC8uvNoLeft
+d8340da7 dec_neon: add DC8uv
+a66e66c7 webp-container-spec: remove references to fragments
+7ce8788b MIPS: dspr2: added optimization for function MakeARGB32
+012e623d demux: remove experimental FRGM parsing
+87c3d531 method=0: Don't evaluate any predictor
+6f4fcb98 Merge "MIPS: dspr2: added optimization for function ImportRow"
+24284459 replace unneeded calls to HistogramCopy() by swaps
+bdf7b40c MIPS: dspr2: added optimization for function ImportRow
+e66a9225 Merge "MIPS: dspr2: added optimization for function ExportRowC"
+c279fec1 MIPS: dspr2: added optimization for function ExportRowC
+31a9cf64 Speedup WebP lossless compression for low effort (m=0) mode with following: - Disable Cross-Color transform. - Evaluate predictors #11 (paeth), #12 and #13 only.
+9275d91c MIPS: dspr2: added optimization for function TrueMotion
+26106d66 Merge "enc_neon: fix building with non-Xcode clang (iOS)"
+1c4e3efe unroll the kBands[] indirection to remove a dereference in GetCoeffs()
+a3946b89 enc_neon: fix building with non-Xcode clang (iOS)
+8ed9c00d Merge "simplify the Histogram struct, to only store max_value and last_nz"
+bad77571 simplify the Histogram struct, to only store max_value and last_nz
+3cca0dc7 MIPS: dspr2: Added optimization for DCMode function
+37e395fd MIPS: fix functions to use generic BPS instead of hardcoded value
+9475bef4 PickBestUV: fix VP8Copy16x8 invocation
+441f273f Merge changes I55f8da52,Id73a1e96
+4a279a68 cosmetics: add some missing != NULL comparisons
+66ad3725 factorize BPS definition in dsp.h and add VP8Copy16x8
+432e5b55 make ALIGN_xxx naming consistent
+57606047 encoder: switch BPS to 32 instead of 16
+1b66bbe9 MIPS: dspr2: added optimization for function TransformColor_C
+c6d0f9e7 histogram: cosmetics
+f399d307 Merge changes I6eac17e5,I32d2b514
+9de9074c dec_neon: add TM8uv
+8e517eca bit_reader/kVP8NewRange: range_t -> uint8_t
+e1857139 dsp: initialize VP8PredChroma8 in VP8DspInit()
+e0c809ad Move Entropy methods to lossless.c
+a96ccf8f iosbuild: add x86_64 simulator support
+a0df5510 Remove handling for WEBP_HINT_GRAPH
+413dfc0c Move static method definition before its usage.
+0f235665 Update BackwardRefsWithLocalCache.
+d69e36ec Remove TODOs from lossless encoder code.
+fdaac8e0 Optimize VP8LGetBackwardReferences LZ77 references.
+2f0e2ba8 MIPS: dspr2: added optimization for function Select
+a3e79a46 Merge "WebPEncode: Support encoding same pic twice (even if modified)"
+e4f4dddb WebPEncode: Support encoding same pic twice (even if modified)
+cbc3fbb4 Merge "Updated VP8LGetBackwardReferences and color cache."
+95a9bd85 Updated VP8LGetBackwardReferences and color cache.
+54f2c14c MIPS: dspr2: added optimization for function FTransform
+aa42f423 MIPS: dspr2: Added optimization for function VP8LSubtractGreenFromBlueAndRed
+11a25f75 Merge "FlattenSimilarBlocks should only be tried when blending is possible."
+5cccdadf FlattenSimilarBlocks should only be tried when blending is possible.
+95ca44a7 MIPS: dspr2: added optimization for Disto4x4
+4171b672 backward_references.c: reindent after c8581b0
+c8581b06 Optimize BackwardReferences for RLE encoding.
+5798eee6 MIPS: dspr2: unfilters bugfix (Ie7b7387478a6b5c3f08691628ae00f059cf6d899)
+4167a3f5 Optimize backwardreferences
+d18554c3 Merge "webp/types.h: use inline for clang++/-std=c++11"
+7489b0e7 gif2webp: Add '-min-size' option to get best compression.
+77bdddf0 Speed up BackwardReferences
+6638710b webp/types.h: use inline for clang++/-std=c++11
+abf04205 Enable entropy based merge histo for (q<100)
+572022a3 filters_mips_dsp_r2.c: disable unfilters
+a28e21b1 MIPS: dspr2: Added optimization for function ClampedAddSubtractFull
+18d5a1ef MIPS: dspr2: added optimization for function ClampedAddSubtractHalf
+829a8c19 MIPS: dspr2: added optimization for ITransform
+c94ed49e gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
+653ace55 Increase the MAX_COLOR_CACHE_BITS from 9 to 10.
+919220c7 Change the logic adjusting the Histogram bits.
+53b096c0 Merge "Fix bug in VP8LCalculateEstimateForCacheSize."
+e912bd55 Fix bug in VP8LCalculateEstimateForCacheSize.
+541d7839 Merge "dec_neon: add RD4 intra predictor"
+f8cd0672 Merge "Makefile.vc: add a 'legacy' RTLIBCFG option"
+22881c99 dec_neon: add RD4 intra predictor
+613d281e update NEWS
+1304eb34 Merge "dec_neon: DC4: use pair-wise adds for top row"
+34c20c06 Makefile.vc: add a 'legacy' RTLIBCFG option
+7083006b Merge "dsp/dec_{neon,sse2}: VE4: normalize variable names"
+0db9031c dsp/dec_{neon,sse2}: VE4: normalize variable names
+b5bc1530 dec_neon: DC4: use pair-wise adds for top row
+5b90d8fe Unify the API between VP8BitWriter and VP8LBitWriter
+f7ada560 Merge changes I2e06907b,Ia9ed4ca6,I782282ff
+5beb6bf0 Merge "dec_neon: add VE4 intra predictor"
+eba6ce06 dec_neon: add DC4 intra predictor
+79abfbd9 dec_neon: add TM4 intra predictor
+fe395f0e dec_neon: add LD4 intra predictor
+32de385e dec_neon: add VE4 intra predictor
+72395ba9 Merge "Modify CostModel to allocate optimal memory."
+65e5eb8a gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
+e4c829ef gif2webp: Handle frames with odd offsets + disposal to background.
+c2b5a039 Modify CostModel to allocate optimal memory.
+b7a33d7e implement VE4/HE4/RD4/... in SSE2
+97c76f1f make VP8PredLuma4[] non-const and initialize array in VP8DspInit()
+0ea8c6c2 Merge "PrintReg: output to stderr"
+d7ff2f97 Merge "stopwatch.h: fix includes"
+f85ec712 PrintReg: output to stderr
+54edbf65 stopwatch.h: fix includes
+139142e4 Optimize BackwardReferenceHashChainFollowPath.
+5f36b68d enc/backward_references.c: fix indent
+e0e9960d Merge "sync version numbers to 0.4.2 release"
+64ac5144 sync version numbers to 0.4.2 release
+c24f8954 Simplify and speedup Backward refs computation.
+d1c359ef fix shared object build with -fvisibility=hidden
+a4c3a31b WEBP_TSAN_IGNORE_FUNCTION: fix gcc compat warning
+f358eeb8 add code for testing random incremental decoding in dwebp
+80247291 mark some init function as being safe for thread_sanitizer.
+79b5bdbf bit_reader.h: cosmetics: fix a typo
+6c673681 Improved near-lossless mode.
+0ce27e71 enc_mips32: workaround gcc-4.9 bug
+aca1b98f enc/vp8l.c: fix indent
+ca005027 Evaluate non-palette compression for palette image
+c8a87bb6 AssignSegments: quiet -Warray-bounds warning
+32f67e30 Merge "enc_neon: initialize vectors w/vdup_n_u32"
+fabc65da 1-3% faster encoding optimizing SSE_NxN functions
+7534d716 enc_neon: initialize vectors w/vdup_n_u32
+5f813912 Merge "Fix return code of EncodeImageInternal()"
+e321abe4 Fix return code of EncodeImageInternal()
+f82cb06a optimize palette ordering
+f545feee don't set the alpha value for histogram index image
+2d9b0a44 add WebPDispatchAlphaToGreen() to dsp
+1bd4c2ad Merge "Change Entropy based Histogram Combine heuristic."
+e295b8f1 Merge "iosbuild: cleanup"
+1be4e760 Merge "iosbuild: output autoconf req. on failure"
+d5e498d4 Change Entropy based Histogram Combine heuristic.
+47a2d8e1 fix MSVC float->int conversion warning
+041956f6 iosbuild: cleanup
+767eb402 iosbuild: output autoconf req. on failure
+35ad48b8 HistoHeapInit: correct positions allocation size
+45d9635f lossless: entropy clustering for high qualities.
+dc37df8c fix type warning for VS9_x64
+9f7d9e6d iosbuild: make iOS 6 the minimum requirement
+fdd6528b Remove unused VP8LDecoder member variable
+ea3bba5a Merge "rewrite Disto4x4 in enc_neon.c with intrinsic"
+f060dfc4 add lossless incremental decoding support
+ab70794d rewrite Disto4x4 in enc_neon.c with intrinsic
+d4471637 MIPS: dspr2: added optimization for function FilterLoop24
+2aef54d4 Merge "prepare VP8LDecodeImage for incremental decode"
+aed0f5a2 Merge "MIPS: dspr2: added optimization for function FilterLoop26"
+28630685 prepare VP8LDecodeImage for incremental decode
+248f3aed remove br->error_ field
+49e15044 MIPS: dspr2: added optimization for function FilterLoop26
+38128cb9 iosbuild.sh: only install .h files in Headers
+c792d412 Premultiply with alpha during U/V downsampling
+0cc811d7 gif2webp: Background color correction
+d7167ff7 Amend the lossless spec according to issue #205, #206 and #224
+b901416b Record the lossless size stats.
+cddd3340 Add a WebPExtractAlpha function to dsp
+0716a98e fix indent after I0204949917836f74c0eb4ba5a7f4052a4797833b
+f9ced95a Optimize lossless decoding for trivial(ARB) codes.
+924fcfd9 Merge "webpmux: simplify InitializeConfig()"
+c0a462ca webpmux: simplify InitializeConfig()
+6986bb5e webpmux: fix indent
+f89e1690 webpmux: fix exit status on numeric value parse error
+2172cb62 Merge "webpmux: fix loop_count range check"
+e3b343ec Merge "examples: warn on invalid numeric parameters"
+0e23c487 webpmux: fix loop_count range check
+6208338a Merge "fix loop bug in DispatchAlpha()"
+d51f3e40 gif2webp: Handle frames with missing graphic control extension
+690b491a fix loop bug in DispatchAlpha()
+96d43a87 examples: warn on invalid numeric parameters
+3101f537 MIPS: dspr2: added optimization for TransformOne
+a6bb9b17 SSE2 for inverse Mult(ARGB)Row and ApplyAlphaMultiply
+d84a8ffd Remove default initialization of decoder status.
+be70b86c configure: simplify libpng-config invocation
+e0a99321 Rectify bug in lossless incremental decoding.
+e2502a97 MIPS: dspr2: added optimization for TransformAC3
+24e1072a MIPS: dspr2: added optimization for TransformDC
+c0e84df8 Merge "Slightly faster lossless decoding (1%)"
+8dd28bb5 Slightly faster lossless decoding (1%)
+f0103595 MIPS: dspr2: added optimization for ColorIndexInverseTransforms
+d3242aee make VP8LSetBitPos() set br->eos_ flag
+a9decb55 Lossless decoding: fix eos_ flag condition
+3fea6a28 fix erroneous dec->status_ setting
+80b8099f MIPS: dspr2: add some specific mips code to commit I2c3f2b12f8df15b785fad5a9c56316e954ae0c53
+e5640625 Merge "further refine the COPY_PATTERN optim for DecodeAlpha"
+854509fe enc/histogram.c: reindent after f4059d0
+34421964 Merge "~3-5% faster encoding optimizing PickBestIntra*()"
+865069c1 further refine the COPY_PATTERN optim for DecodeAlpha
+a5956228 added C-level optimization for DecodeAlphaData function
+187d379d add a fallback to ALPHA_NO_COMPRESSION
+a48a2d76 ~3-5% faster encoding optimizing PickBestIntra*()
+a6140194 ExUtilReadFromStdin: (windows) open stdin in bin mode
+e80eab1f webpmux: (windows) open stdout in binary mode
+e9bfb116 cwebp: (windows) open stdout in binary mode
+5927e15b example_util: add ExUtilSetBinaryMode
+30f3b75b webpmux man page: Clarify some title, descriptions and examples
+77d4c7e3 address cosmetic comments from patch #71380
+f75dfbf2 Speed up Huffman decoding for lossless
+637b3888 dsp/lossless: workaround gcc-4.9 bug on arm
+8323a903 dsp.h: collect gcc/clang version test macros
+e6c4b52f move static initialization of WebPYUV444Converters[] to the Init function.
+49911d4d Merge "fix indentation"
+f4059d0c Code cleanup for HistogramRemap.
+e632b092 fix indentation
+f5c04d64 Merge "add a DispatchAlpha() for SSE2 that handles 8 pixels at a time"
+fc98edd9 add a DispatchAlpha() for SSE2 that handles 8 pixels at a time
+73d361dd introduce VP8EncQuantize2Blocks to quantize two blocks at a time
+0b21c30b MIPS: dspr2: added optimization for EmitAlphaRGB
+953acd56 enc_neon: enable QuantizeBlock for aarch64
+f4ae1437 MIPS: mips32: code rebase
+56977154 MIPS: dspr2: added optimizations for VP8YuvTo*
+2523aa73 SmartRGBYUV: fix odd-width problem with pixel replication
+ee52dc4e fix some MSVC64 warning about float conversion
+3fca851a cpu: check for _MSC_VER before using msvc inline asm
+e2a83d71 faster RGB->YUV conversion function (~7% speedup)
+de2d03e1 Merge "Add smart RGB->YUV conversion option -pre 4"
+3fc4c539 Add smart RGB->YUV conversion option -pre 4
+b4dc4069 MIPS: dspr2: added optimization for (un)filters
+137e6090 Merge "configure: add work around for gcc-4.9 aarch64 bug"
+b61c9cec MIPS: dspr2: Optimization of some simple point-sampling functions
+e2b8cec0 configure: add work around for gcc-4.9 aarch64 bug
+98c54107 MIPS: mips32r2: added optimization for BSwap32
+dab702b3 Update PATENTS to reflect s/VP8/WebM/g
+b564f7c7 Merge "MIPS: detect mips32r6 and disable mips32r1 code"
+b7e5a5c4 MIPS: detect mips32r6 and disable mips32r1 code
+63c2fc02 Correctly use the AC_CANONICAL_* macros
+bb07022b Merge "cosmetics"
+e300c9d8 cosmetics
+0e519eea Merge "cosmetics: remove some extraneous 'extern's"
+3ef0f08a Merge "vp8enci.h: cosmetics: fix '*' placement"
+4c6dde37 bit_writer: cosmetics: rename kFlush() -> Flush()
+f7b4c48b cosmetics: remove some extraneous 'extern's
+b47fb00a vp8enci.h: cosmetics: fix '*' placement
+b5a36cc9 add -near_lossless [0..100] experimental option
+0524d9e5 dsp: detect mips64 & disable mips32 code
+d3485d96 cwebp.1: fix quality description placement
+29a9fe22 Merge tag 'v0.4.1'
+8af27718 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1)
+e09e9ff6 Record & log the image pre-processing time.
+f59c0b4b iosbuild.sh: specify optimization flags
+8d34ea3e update ChangeLog (tag: v0.4.1-rc1)
+dbc3da66 makefile.unix: add vwebp.1 to the dist target
+89a7c83c update ChangeLog
+ffe67ee9 Merge "update NEWS for the next release" into 0.4.1
+2def1fe6 gif2webp: dust up the help message
+fb668d78 remove -noalphadither option from README/vwebp.1
+e49f693b update NEWS for the next release
+cd013580 Merge "update AUTHORS" into 0.4.1
+268d01eb update AUTHORS
+85213b9b bump version to 0.4.1
+695f80ae Merge "restore mux API compatibility" into 0.4.1
+862d296c restore mux API compatibility
+8f6f8c5d remove the !WEBP_REFERENCE_IMPLEMENTATION tweak in Put8x8uv
+d713a696 Merge changes If4debc15,I437a5d5f into 0.4.1
+c2fc52e4 restore encode API compatibility
+793368e8 restore decode API compatibility
+b8984f31 gif2webp: fix compile with giflib 5.1.0
+222f9b1a gif2webp: simplify giflib version checking
+d2cc61b7 Extend MakeARGB32() to accept Alpha channel.
+4595b62b Merge "use explicit size of kErrorMessages[] arrays"
+157de015 Merge "Actuate memory stats for PRINT_MEMORY_INFO"
+fbda2f49 JPEG decoder: delay conversion to YUV to WebPEncode() call
+0b747b1b use explicit size of kErrorMessages[] arrays
+3398d81a Actuate memory stats for PRINT_MEMORY_INFO
+6f3202be Merge "move WebPPictureInit to picture.c"
+6c347bbb move WebPPictureInit to picture.c
+fb3acf19 fix configure message for multi-thread
+40b086f7 configure: check for _beginthreadex
+1549d620 reorder the YUVA->ARGB and ARGB->YUVA functions correctly
+c6461bfd Merge "extract colorspace code from picture.c into picture_csp.c"
+736f2a17 extract colorspace code from picture.c into picture_csp.c
+645daa03 Merge "configure: check for -Wformat-security"
+abafed86 configure: check for -Wformat-security
+fbadb480 split monolithic picture.c into picture_{tools,psnr,rescale}.c
+c76f07ec dec_neon/TransformAC3: initialize vector w/vcreate
+bb4fc051 gif2webp: Allow single-frame animations
+46fd44c1 thread: remove harmless race on status_ in End()
+5a1a7264 Merge "configure: check for __builtin_bswapXX()"
+6781423b configure: check for __builtin_bswapXX()
+6450c48d configure: fix iOS builds
+6422e683 VP8LFillBitWindow: enable fast path for 32-bit builds
+4f7f52b2 VP8LFillBitWindow: respect WEBP_FORCE_ALIGNED
+e458badc endian_inl.h: implement htoleXX with BSwapXX
+f2664d1a endian_inl.h: add BSwap16
+6fbf5345 Merge "configure: add --enable-aligned"
+dc0f479d configure: add --enable-aligned
+9cc69e2b Merge "configure: support WIC + OpenGL under mingw64"
+257adfb0 remove experimental YUV444 YUV422 and YUV400 code
+10f4257c configure: support WIC + OpenGL under mingw64
+380cca4f configure.ac: add AC_C_BIGENDIAN
+ee70a901 endian_inl.h: add BSwap64
+47779d46 endian_inl.h: add BSwap32
+d5104b1f utils: add endian_inl.h
+58ab6224 Merge "make alpha-detection loop in IsKeyFrame() in good x/y order"
+9d562902 make alpha-detection loop in IsKeyFrame() in good x/y order
+516971b1 lossless: Remove unaligned read warning
+b8b596f6 Merge "configure.ac: add an autoconf version prerequisite"
+34b02f8c configure.ac: add an autoconf version prerequisite
+e59f5360 neon: normalize vdup_n_* usage
+6ee7160d Merge changes I0da7b3d3,Idad2f278,I4accc305
+abc02f24 Merge "fix (uncompiled) typo"
+bc03670f neon: add INIT_VECTOR4
+6c1c632b neon: add INIT_VECTOR3
+dc7687e5 neon: add INIT_VECTOR2
+4536e7c4 add WebPMuxSetCanvasSize() to the mux API
+824eab10 fix (uncompiled) typo
+1f3e5f1e remove unused 'shift' argument and QFIX2 define
+8e867051 Merge "VP8LoadNewBytes: use __builtin_bswap32 if available"
+1b6a2635 Merge "Fix handling of weird GIF with canvas dimension 0x0"
+1da3d461 VP8LoadNewBytes: use __builtin_bswap32 if available
+1582e402 Fix handling of weird GIF with canvas dimension 0x0
+b8811dac Merge "rename interface -> winterface"
+db8b8b5f Fix logic in the GIF LOOP-detection parsing
+25aaddc8 rename interface -> winterface
+5584d9d2 make WebPSetWorkerInterface() check its arguments
+a9ef7ef9 Merge "cosmetics: update thread.h comments"
+c6af9991 Merge "dust up the help message"
+0a8b8863 dust up the help message
+a9cf3191 cosmetics: update thread.h comments
+27bfeee4 QuantizeBlock SSE2 Optimization:
+2bc0dc3e Merge "webpmux: warn when odd frame offsets are used"
+3114ebe4 Merge changes Id8edd3c1,Id418eb96,Ide05e3be
+c0726634 webpmux: warn when odd frame offsets are used
+c5c6b408 Merge "add alpha dithering for lossy"
+d5146784 examples/Android.mk: add cwebp
+ca0fa7c7 Android.mk: move dwebp to examples/Android.mk
+73d8fca0 Android.mk: add ENABLE_SHARED flag
+6e93317f muxread: fix out of bounds read
+8b0f6a48 Makefile.vc: fix CFLAGS assignment w/HAVE_AVX2=1
+bbe32df1 add alpha dithering for lossy
+79020767 Merge "make error-code reporting consistent upon malloc failure"
+77bf4410 make error-code reporting consistent upon malloc failure
+7a93c000 **/Makefile.am: remove unused AM_CPPFLAGS
+24e30805 Add an interface abstraction to the WebP worker thread implementation
+d6cd6358 Merge "fix orig_rect==NULL case"
+2bfd1ffa fix orig_rect==NULL case
+059e21c1 Merge "configure: move config.h to src/webp/config.h"
+f05fe006 properly report back encoding error code in WebPFrameCacheAddFrame()
+32b31379 configure: move config.h to src/webp/config.h
+90090d99 Merge changes I7c675e51,I84f7d785
+ae7661b3 makefiles: define WEBP_HAVE_AVX2 when appropriate
+69fce2ea remove the special casing for res->first in VP8SetResidualCoeffs
+6e61a3a9 configure: test for -msse2
+b9d2efc6 rename upsampling_mips32.c to yuv_mips32.c
+bdfeebaa dsp/yuv: move sse2 functions to yuv_sse2.c
+46b32e86 Merge "configure: set WEBP_HAVE_AVX2 when available"
+88305db4 Merge "VP8RandomBits2: prevent signed int overflow"
+73fee88c VP8RandomBits2: prevent signed int overflow
+db4860b3 enc_sse2: prevent signed int overflow
+3fdaf4d2 Merge "real fix for longjmp warning"
+385e3340 real fix for longjmp warning
+230a0555 configure: set WEBP_HAVE_AVX2 when available
+a2ac8a42 restore original value_/range_ field order
+5e2ee56f Merge "remove libwebpdspdecode dep on libwebpdsp_avx2"
+61362db5 remove libwebpdspdecode dep on libwebpdsp_avx2
+42c447ae Merge "lossy bit-reader clean-up:"
+479ffd8b Merge "remove unused #include's"
+9754d39a Merge "strong filtering speed-up (~2-3% x86, ~1-2% for NEON)"
+158aff9b remove unused #include's
+09545eea lossy bit-reader clean-up:
+ea8b0a17 strong filtering speed-up (~2-3% x86, ~1-2% for NEON)
+6679f899 Optimize VP8SetResidualCoeffs.
+ac591cf2 fix for gcc-4.9 warnings about longjmp + local variables
+4dfa86b2 dsp/cpu: NaCl has no support for xgetbv
+4c398699 Merge "cwebp: fallback to native webp decode in WIC builds"
+33aa497e Merge "cwebp: add some missing newlines in longhelp output"
+c9b340a2 fix missing WebPInitAlphaProcessing call for premultiplied colorspace output
+57897bae Merge "lossless_neon: use vcreate_*() where appropriate"
+6aa4777b Merge "(enc|dec)_neon: use vcreate_*() where appropriate"
+0d346e41 Always reinit VP8TransformWHT instead of hard-coding
+7d039fc3 cwebp: fallback to native webp decode in WIC builds
+d471f424 cwebp: add some missing newlines in longhelp output
+bf0e0030 lossless_neon: use vcreate_*() where appropriate
+9251c2f6 (enc|dec)_neon: use vcreate_*() where appropriate
+399b916d lossy decoding: correct alpha-rescaling for YUVA format
+78c12ed8 Merge "Makefile.vc: add rudimentary avx2 support"
+dc5b122f try to remove the spurious warning for static analysis
+ddfefd62 Makefile.vc: add rudimentary avx2 support
+a8911643 Merge "simplify VP8LInitBitReader()"
+fdbcd44d simplify VP8LInitBitReader()
+7c004287 makefile.unix: add rudimentary avx2 support
+515e35cf Merge "add stub dsp/enc_avx2.c"
+a05dc140 SSE2: yuv->rgb speed-up for point-sampling
+178e9a69 add stub dsp/enc_avx2.c
+1b99c09c Merge "configure: add a test for -mavx2"
+fe728071 configure: add a test for -mavx2
+e46a247c cpu: fix check for __cpuidex availability
+176fda26 fix the bit-writer for lossless in 32bit mode
+541784c7 dsp.h: add a check for AVX2 / define WEBP_USE_AVX2
+bdb151ee dsp/cpu: add AVX2 detection
+ab9f2f86 Merge "revamp the point-sampling functions by processing a full plane"
+a2f8b289 revamp the point-sampling functions by processing a full plane
+ef076026 use decoder's DSP functions for autofilter
+2b5cb326 Merge "dsp/cpu: add AVX detection"
+df08e67e dsp/cpu: add AVX detection
+e2f405c9 Merge "clean-up and slight speed-up in-loop filtering SSE2"
+f60957bf clean-up and slight speed-up in-loop filtering SSE2
+9fc3ae46 .gitattributes: treat .ppm as binary
+3da924b5 Merge "dsp/WEBP_USE_NEON: test for __aarch64__"
+c7164490 Android.mk: always include *_neon.c in the build
+a577b23a dsp/WEBP_USE_NEON: test for __aarch64__
+54bfffca move RemapBitReader() from idec.c to bit_reader code
+34168ecb Merge "remove all unused layer code"
+f1e77173 remove all unused layer code
+b0757db7 Code cleanup for VP8LGetHistoImageSymbols.
+5fe628d3 make the token page size be variable instead of fixed 8192
+f948d08c memory debug: allow setting pre-defined malloc failure points
+ca3d746e use block-based allocation for backward refs storage, and free-lists
+1ba61b09 enable NEON intrinsics in aarch64 builds
+b9d2bb67 dsp/neon.h: coalesce intrinsics-related defines
+b5c75258 iosbuild: add support for iOSv7/aarch64
+9383afd5 Reduce number of memory allocations while decoding lossless.
+888e63ed Merge "dsp/lossless: prevent signed int overflow in left shift ops"
+8137f3ed Merge "instrument memory allocation routines for debugging"
+2aa18736 instrument memory allocation routines for debugging
+d3bcf72b Don't allocate VP8LHashChain, but treat like automatic object
+bd6b8619 dsp/lossless: prevent signed int overflow in left shift ops
+b7f19b83 Merge "dec/vp8l: prevent signed int overflow in left shift ops"
+29059d51 Merge "remove some uint64_t casts and use."
+e69a1df4 dec/vp8l: prevent signed int overflow in left shift ops
+cf5eb8ad remove some uint64_t casts and use.
+38e2db3e MIPS: MIPS32r1: Added optimization for HistogramAdd.
+e0609ade dwebp: fix exit code on webp load failure
+bbd358a8 Merge "example_util.h: avoid forward declaring enums"
+8955da21 example_util.h: avoid forward declaring enums
+6d6865f0 Added SSE2 variants for Average2/3/4
+b3a616b3 make HistogramAdd() a pointer in dsp
+c8bbb636 dec_neon: relocate some inline-asm defines
+4e393bb9 dec_neon: enable intrinsics-only functions
+ba99a922 dec_neon: use positive tests for USE_INTRINSICS
+69058ff8 Merge "example_util: add ExUtilDecodeWebPIncremental"
+a7828e8b dec_neon: make WORK_AROUND_GCC conditional on version
+3f3d717a Merge "enc_neon: enable intrinsics-only functions"
+de3cb6c8 Merge "move LOCAL_GCC_VERSION def to dsp.h"
+1b2fe14d example_util: add ExUtilDecodeWebPIncremental
+ca49e7ad Merge "enc_neon: move Transpose4x4 to dsp/neon.h"
+ad900abd Merge "fix warning about size_t -> int conversion"
+4825b436 fix warning about size_t -> int conversion
+42b35e08 enc_neon: enable intrinsics-only functions
+f937e012 move LOCAL_GCC_VERSION def to dsp.h
+5e1a17ef enc_neon: move Transpose4x4 to dsp/neon.h
+c7b92a5a dec_neon: (WORK_AROUND_GCC) delete unused Load4x8
+8e5f90b0 Merge "make ExUtilLoadWebP() accept NULL bitstream param."
+05d4c1b7 Merge "cwebp: add webpdec"
+ddeb6ac8 cwebp: add webpdec
+35d7d095 Merge "Reduce memory footprint for encoding WebP lossless."
+0b896101 Reduce memory footprint for encoding WebP lossless.
+f0b65c9a make ExUtilLoadWebP() accept NULL bitstream param.
+9c0a60cc Merge "dwebp: move webp decoding to example_util"
+1d62acf6 MIPS: MIPS32r1: Added optimization for HuffmanCost functions.
+4a0e7390 dwebp: move webp decoding to example_util
+c0220460 Merge "Bugfix: Incremental decode of lossy-alpha"
+8c7cd722 Bugfix: Incremental decode of lossy-alpha
+7955152d MIPS: fix error with number of registers.
+b1dabe37 Merge "Move the HuffmanCost() function to dsp lib"
+75b12006 Move the HuffmanCost() function to dsp lib
+2772b8bd MIPS: fix assembler error revealed by clang's debug build
+6653b601 enc_mips32: fix unused symbol warning in debug
+8dec1209 enc_mips32: disable ITransform(One) in debug builds
+98519dd5 enc_neon: convert Disto4x4 to intrinsics
+fe9317c9 cosmetics:
+953b0746 enc_neon: cosmetics
+a9fc697c Merge "WIP: extract the float-calculation of HuffmanCost from loop"
+3f84b521 Merge "replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)"
+4ae0533f MIPS: MIPS32r1: Added optimizations for ExtraCost functions.
+b30a04cf WIP: extract the float-calculation of HuffmanCost from loop
+a8fe8ce2 Merge "NEON intrinsics version of CollectHistogram"
+95203d2d NEON intrinsics version of CollectHistogram
+7ca2e74b replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)
+41c6efbd fix lossless_neon.c
+8ff96a02 NEON intrinsics version of FTransform
+0214f4a9 Merge "MIPS: MIPS32r1: Added optimizations for FastLog2"
+baabf1ea MIPS: MIPS32r1: Added optimizations for FastLog2
+3d49871d NEON functions for lossless coding
+3fe02915 MIPS: MIPS32r1: Added optimizations for SSE functions.
+c503b485 Merge "fix the gcc-4.6.0 bug by implementing alternative method"
+abe6f487 fix the gcc-4.6.0 bug by implementing alternative method
+5598bdec enc_mips32.c: fix file mode
+2b1b4d5a MIPS: MIPS32r1: Add optimization for GetResidualCost
+f0a1f3cd Merge "MIPS: MIPS32r1: Added optimization for FTransform"
+7231f610 MIPS: MIPS32r1: Added optimization for FTransform
+869eaf6c ~30% encoding speedup: use NEON for QuantizeBlock()
+f758af6b enc_neon: convert FTransformWHT to intrinsics
+7dad095b MIPS: MIPS32r1: Added optimization for Disto4x4 (TTransform)
+2298d5f3 MIPS: MIPS32r1: Added optimization for QuantizeBlock
+e88150c9 Merge "MIPS: MIPS32r1: Add optimization for ITransform"
+de693f25 lossless_neon: disable VP8LConvert* functions
+4143332b NEON intrinsics for encoding
+0ca2914b MIPS: MIPS32r1: Add optimization for ITransform
+71bca5ec dec_neon: use vst_lane instead of vget_lane
+bf061052 Intrinsics NEON version of TransformOne
+19c6f1ba Merge "dec_neon: use vld?_lane instead of vset?_lane"
+7a94c0cf upsampling_neon: drop NEON suffix from local functions
+d14669c8 upsampling_sse2: drop SSE2 suffix from local functions
+2ca42a4f enc_sse2: drop SSE2 suffix from local functions
+d038e619 dec_sse2: drop SSE2 suffix from local functions
+fa52d752 dec_neon: use vld?_lane instead of vset?_lane
+c520e77d cosmetic: fix long line
+4b0f2dae Merge "add intrinsics NEON code for chroma strong-filtering"
+e351ec07 add intrinsics NEON code for chroma strong-filtering
+aaf734b8 Merge "Add SSE2 version of forward cross-color transform"
+c90a902e Add SSE2 version of forward cross-color transform
+bc374ff3 Use histogram_bits to initialize transform_bits.
+2132992d Merge "Add strong filtering intrinsics (inner and outer edges)"
+5fbff3a6 Add strong filtering intrinsics (inner and outer edges)
+d4813f0c Add SSE2 function for Inverse Cross-color Transform
+26029568 dec_neon: add strong loopfilter intrinsics
+cca7d7ef Merge "add intrinsics version of SimpleHFilter16NEON()"
+1a05dfa7 windows: fix dll builds
+d6c50d8a Merge "add some colorspace conversion functions in NEON"
+4fd7c82e SSE2 variants of Subtract-Green: Rectify loop condition
+97e5fac3 add some colorspace conversion functions in NEON
+b9a7a45f add intrinsics version of SimpleHFilter16NEON()
+daccbf40 add light filtering NEON intrinsics
+af444608 fix typo in STORE_WHT
+6af6b8e1 Tune HistogramCombineBin for large images.
+af93bdd6 use WebPSafe[CM]alloc/WebPSafeFree instead of [cm]alloc/free
+51f406a5 lossless_sse2: relocate VP8LDspInitSSE2 proto
+0f4f721b separate SSE2 lossless functions into its own file
+514fc251 VP8LConvertFromBGRA: use conversion function pointers
+6d2f3527 dsp/dec: TransformDCUV: use VP8TransformDC
+defc8e1b Merge "fix out-of-bound read during alpha-plane decoding"
+fbed3643 Merge "dsp: reuse wht transform from dec in encoder"
+d8467084 Merge "Add SSE2 version of ARGB -> BGR/RGB/... conversion functions"
+207d03b4 fix out-of-bound read during alpha-plane decoding
+d1b33ad5 2-5% faster trellis with clang/MacOS (and ~2-3% on ARM)
+369c26dd Add SSE2 version of ARGB -> BGR/RGB/... conversion functions
+df230f27 dsp: reuse wht transform from dec in encoder
+80e218d4 Android.mk: fix build with APP_ABI=armeabi-v7a-hard
+59daf083 Merge "cosmetics:"
+53622008 cosmetics:
+3e7f34a3 AssignSegments: quiet array-bounds warning
+3c2ebf58 Merge "UpdateHistogramCost: avoid implicit double->float"
+cf821c82 UpdateHistogramCost: avoid implicit double->float
+312e638f Extend the search space for GetBestGreenRedToBlue
+1c58526f Fix few nits
+fef22704 Optimize and re-structure VP8LGetHistoImageSymbols
+068b14ac Optimize lossless decoding.
+5f0cfa80 Do a binary search to get the optimum cache bits.
+24ca3678 Merge "allow 'cwebp -o -' to emit output to stdout"
+e12f874e allow 'cwebp -o -' to emit output to stdout
+2bcad89b allow some more stdin/stdout I/O
+84ed4b3a fix cwebp.1 typos after patch #69199
+65b99f1c add a -z option to cwebp, and WebPConfigLosslessPreset() function
+30176619 4-5% faster trellis by removing some unneeded calculations.
+687a58ec histogram.c: reindent after b33e8a0
+06d456f6 Merge "~3-4% faster lossless encoding"
+c60de260 ~3-4% faster lossless encoding
+42eb06fc Merge "few cosmetics after patch #69079"
+82af8264 few cosmetics after patch #69079
+b33e8a05 Refactor code for HistogramCombine.
+ca1bfff5 Merge "5-10% encoding speedup with faster trellis (-m 6)"
+5aeeb087 5-10% encoding speedup with faster trellis (-m 6)
+82ae1bf2 cosmetics: normalize VP8GetCPUInfo checks
+e3dd9243 Merge "Refactor GetBestPredictorForTile for future tuning."
+206cc1be Refactor GetBestPredictorForTile for future tuning.
+3cb84062 Merge "speed-up trellis quant (~5-10% overall speed-up)"
+b66f2227 Merge "lossy encoding: ~3% speed-up"
+4287d0d4 speed-up trellis quant (~5-10% overall speed-up)
+390c8b31 lossy encoding: ~3% speed-up
+9a463c4a Merge "dec_neon: convert TransformWHT to intrinsics"
+e8605e96 Merge "dec_neon: add ConvertU8ToS16"
+4aa3e412 MIPS: MIPS32r1: rescaler bugfix
+c16cd99a Speed up lossless encoder.
+9d6b5ff1 dec_neon: convert TransformWHT to intrinsics
+2ff0aae2 dec_neon: add ConvertU8ToS16
+77a8f919 fix compilation with USE_YUVj flag
+4acbec1b Merge changes I3b240ffb,Ia9370283,Ia2d28728
+2719bb7e dec_neon: TransformAC3: work on packed vectors
+b7b60ca1 dec_neon: add SaturateAndStore4x4
+b7685d73 Rescale: let ImportRow / ExportRow be pointer-to-function
+e02f16ef dec_neon.c: convert TransformDC to intrinsics
+9cba963f add missing file
+8992ddb7 use static clipping tables
+0235d5e4 1-2% faster quantization in SSE2
+b2fbc36c fix VC12-x64 warning
+6e37cb94 Merge "cosmetics: backward_references.c: reindent after a7d2ee3"
+a42ea974 cosmetics: backward_references.c: reindent after a7d2ee3
+6c327442 Merge "fix missing __BIG_ENDIAN__ definition on some platform"
+a8b6aad1 fix missing __BIG_ENDIAN__ definition on some platform
+fde2904b Increase initial buffer size for VP8L Bit Writer.
+a7d2ee39 Optimize cache estimate logic.
+7fb6095b Merge "dec_neon.c: add TransformAC3"
+bf182e83 VP8LBitWriter: use a bit-accumulator
+3f40b4a5 Merge "MIPS: MIPS32r1: clang macro warning resolved"
+1684f4ee WebP Decoder: Mark some truncated bitstreams as invalid
+acbedac4 MIPS: MIPS32r1: clang macro warning resolved
+228e4877 dec_neon.c: add TransformAC3
+393f89b7 Android.mk: avoid gcc-specific flags with clang
+32aeaf11 revamp VP8LColorSpaceTransform() a bit
+0c7cc4ca Merge "Don't dereference NULL, ensure HashChain fully initialized"
+391316fe Don't dereference NULL, ensure HashChain fully initialized
+926ff402 WEBP_SWAP_16BIT_CSP: remove code dup
+1d1cd3bb Fix decode bug for rgbA_4444/RGBA_4444 color-modes.
+939e70e7 update AUTHORS file
+8934a622 cosmetics: *_mips32.c
+dd438c9a MIPS: MIPS32r1: Optimization of some simple point-sampling functions. PATCH [6/6]
+53520911 Added support for calling sampling functions via pointers.
+d16c6974 MIPS: MIPS32r1: Optimization of filter functions. PATCH [5/6]
+04336fc7 MIPS: MIPS32r1: Optimization of function TransformOne. PATCH [4/6]
+92d8fc7d MIPS: MIPS32r1: Optimization of function WebPRescalerImportRow. PATCH [3/6]
+bbc23ff3 parse one row of intra modes altogether
+a2f608f9 Merge "MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]"
+88230854 MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]
+c5a5b028 decode mt+incremental: fix segfault in debug builds
+9882b2f9 always use fast-analysis for all methods.
+000adac0 Merge "autoconf: update ax_pthread.m4"
+2d2fc37d update .gitignore
+5bf4255a Merge "Make it possible to avoid automagic dependencies"
+c1cb1933 disable NEON for arm64 platform
+73a304e9 Make it possible to avoid automagic dependencies
+4d493f8d MIPS: MIPS32r1: Decoder bit reader function optimized. PATCH [1/6]
+c741183c make WebPCleanupTransparentArea work with argb picture
+5da18552 add a decoding option to flip image vertically
+00c3c4e1 Merge "add man/vwebp.1"
+2c6bb428 add man/vwebp.1
+ea59a8e9 Merge "Merge tag 'v0.4.0'"
+7574bed4 fix comments related to array sizes
+0b5a90fd dwebp.1: fix option formatting
+effcb0fd Merge tag 'v0.4.0'
+7c76255d autoconf: update ax_pthread.m4
+fff2a11b make -short work with -print_ssim, -print_psnr, etc.
+68e7901d update ChangeLog (tag: v0.4.0-rc1, tag: v0.4.0, origin/0.4.0, 0.4.0)
+256e4333 update NEWS description with new general features
+29625340 Merge "gif2webp: don't use C99 %zu" into 0.4.0
+3b9f9dd0 gif2webp: don't use C99 %zu
+b5b2e3c7 cwebp: fix metadata output w/lossy+alpha
+ad26df1a makefile.unix: clean up libgif2webp_util.a
+c3b45570 update Changelog
+ca841121 Merge "bump version to 0.4.0" into 0.4.0
+8c524db8 bump version to 0.4.0
+eec2398c update AUTHORS & .mailmap
+b9bbf6a1 update NEWS for 0.4.0
+c72e0811 Merge "dec/webp.c: don't wait for data before reporting w/h"
+5ad65314 dec/frame.c: fix formatting
+f7fc4bc8 dec/webp.c: don't wait for data before reporting w/h
+66a32af5 Merge "NEON speed up"
+26d842eb NEON speed up
+f307f98b Merge "webpmux: let -- stop parameter parsing"
+fe051da7 Merge "README: add a section on gif2webp"
+6fd2bd62 Merge "manpage pedantry"
+4af19007 README: add a section on gif2webp
+6f36ade9 manpage pedantry
+f9016cb9 README: update dwebp options
+b4fa0a47 webpmux: let -- stop parameter parsing
+a9a20acf gif2webp: Add a multi-threaded encode option
+495bef41 fix bug in TrellisQuantize
+605a7127 simplify __cplusplus ifdef
+33109f99 Merge "drop: ifdef __cplusplus checks from C files"
+7f9de0b9 Merge changes I994a5587,I8467bb71,I13b50688,I1e2c9c7b
+5459030b gif2webp: let -- stop parameter parsing
+a4b0aa06 vwebp: let -- stop parameter parsing
+98af68fe cwebp: let -- stop parameter parsing
+a33831e2 dwebp: let -- stop parameter parsing
+36301249 add some checks on error paths
+ce4c7139 Merge "autoconf: add --disable-wic"
+5227d991 drop: ifdef __cplusplus checks from C files
+f6453559 dwebp.1: fix typo
+f91034f2 Merge "cwebp: print metadata stats when no output file is given"
+d4934553 gif2webp: Backward compatibility for giflib version <= 4.1.3
+4c617d32 gif2webp: Disable output of ICC profile by default
+73b731fb introduce a special quantization function for WHT
+41c0cc4b Make Forward WHT transform use 32bit fixed-point calculation
+a3359f5d Only compute quantization params once
+70490437 cwebp: print metadata stats when no output file is given
+d513bb62 fix off-by-one zthresh calculation; remove the sharpening for non luma-AC coeffs; adjust the bias a little bit to compensate for this
+ad9dec0c Merge "cosmetics: dwebp: fix local function name format"
+f737f037 Merge "dwebp: remove a dead store"
+3c3a70da Merge "makefile.unix: install binaries in $(DESTDIR)/bin/"
+150b655f Merge "Android.mk: add some release compile flags"
+dbebd33b cosmetics: dwebp: fix local function name format
+27749951 dwebp: remove a dead store
+a01e04fe autoconf: add --disable-wic
+5009b227 makefile.unix: install binaries in $(DESTDIR)/bin/
+bab30fca Merge "fix -print_psnr / ssim options"
+ebef7fb3 fix -print_psnr / ssim options
+cb637855 Merge "fix bug due to overzealous check in WebPPictureYUVAToARGB()"
+8189885b Merge "EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE"
+4ad7d335 Android.mk: add some release compile flags
+c12e2369 cosmetics: fix a few typos
+6f104034 fix bug due to overzealous check in WebPPictureYUVAToARGB()
+3f6c35c6 EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE
+cc55790e Merge changes I8bb7a4dc,I2c180051,I021a014f,I8a224a62
+c536afb5 Merge "cosmetics: fix some typos"
+cbdd3e6e add a -dither dithering option to the decoder
+e8124012 Updated iosbuild.sh for XCode 5.x
+4931c329 cosmetics: fix some typos
+05aacf77 mux: add some missing casts
+617d9348 enc/vp8l: add a missing cast
+46db2865 idec: add some missing casts
+b524e336 ErrorStatusLossless: correct return type
+cb261f79 fix a descaling bug for vertical/horizontal U/V interpolation
+bcb3955c Merge changes I48968468,I181bc736
+73f52133 gif2webp: Add a mixed compression mode
+6198715e demux: split chunk parsing from ParseVP8X
+d2e3f4e6 demux: add a tail pointer for chunks
+87cffcc3 demux: cosmetics: s/has_frames/is_animation/
+e18e6677 demux: strictly enforce the animation flag
+c4f39f4a demux: cosmetics: remove a useless break
+61cb884d demux: (non-exp) fail if the fragmented flag is set
+ff379db3 few % speedup of lossless encoding
+df3649a2 remove all disabled code related to P-frames
+6d0cb3de Merge "gif2webp: kmin = 0 should suppress key-frame addition."
+36555983 gif2webp: kmin = 0 should suppress key-frame addition.
+7708e609 Merge "detect flatness in blocks and favor DC prediction"
+06b1503e Merge "add comment about the kLevelsFromDelta[][] LUT generation"
+5935259c add comment about the kLevelsFromDelta[][] LUT generation
+e3312ea6 detect flatness in blocks and favor DC prediction
+ebc9b1ee Merge "VPLBitReader bugfix: Catch error if bit_pos > LBITS too."
+96ad0e0a VPLBitReader bugfix: Catch error if bit_pos > LBITS too.
+a014e9c9 tune quantization biases toward higher precision
+1e898619 add helpful PrintBlockInfo() function
+596a6d73 make use of 'extern' consistent in function declarations
+c8d48c6e Merge "extract random utils to their own file util/random.[ch]"
+98aa33cf extract random utils to their own file util/random.[ch]
+432a723e Merge "swig: add basic go bindings"
+fab618b5 Merge "rename libwebp.i -> libwebp.swig"
+e4e7fcd6 swig: add basic go bindings
+d3408720 Merge "fast auto-determined filtering strength"
+f8bfd5cd fast auto-determined filtering strength
+ac0bf951 small clean-up in ExpandMatrix()
+1939607e rename libwebp.i -> libwebp.swig
+43148b6c filtering: precompute ilimit and hev_threshold
+18f992ec simplify f_inner calculation a little
+241d11f1 add missing const
+86c0031e add a 'format' field to WebPBitstreamFeatures
+dde91fde Demux: Correct the extended format validation
+5d6c5bd2 add entry for '-resize' option in cwebp's man
+7c098d18 Use some gamma-curve range compression when computing U/V average
+0b2b0504 Use deterministic random-dithering during RGB->YUV conversion
+8a2fa099 Add a second multi-thread method
+7d6f2da0 Merge "up to 20% faster multi-threaded decoding"
+266f63ea Merge "libwebp.jar: build w/Java 1.6 for Android compat"
+0532149c up to 20% faster multi-threaded decoding
+38efdc2e Simplify the gif2webp tool: move the optimization details to util
+de899516 libwebp.jar: build w/Java 1.6 for Android compat
+cb221552 Decode a full row of bitstream before reconstructing
+dca8a4d3 Merge "NEON/simple loopfilter: avoid q4-q7 registers"
+9e84d901 Merge "NEON/TransformWHT: avoid q4-q7 registers"
+fc10249b NEON/simple loopfilter: avoid q4-q7 registers
+2f09d63e NEON/TransformWHT: avoid q4-q7 registers
+77585a2b Merge "use a macrofunc for setting NzCoeffs bits"
+d155507c Merge "use HINT_GRAPH as image_hint for gif source"
+9c561646 Merge "only print GIF_DISPOSE_WARNING once"
+05879865 use HINT_GRAPH as image_hint for gif source
+0b28d7ab use a macrofunc for setting NzCoeffs bits
+f9bbc2a0 Special-case sparse transform
+00125196 gif2webp: detect and flatten uniformly similar blocks
+0deaf0fa only print GIF_DISPOSE_WARNING once
+6a8c0eb7 Merge "small optimization in segment-smoothing loop"
+f7146bc1 small optimization in segment-smoothing loop
+5a7533ce small gif2webp fix
+4df0c89e Merge changes Ic697660c,I27285521
+5b2e6bd3 Android.mk: add a dwebp target
+f910a84e Android.mk: update build flags
+63f9aba4 special-case WHT transform when there's only DC
+80911aef Merge "7-8% faster decoding by rewriting GetCoeffs()"
+606c4304 gif2webp: Improved compression for lossy animated WebP
+fb887f7f gif2webp: Different kmin/kmax defaults for lossy and lossless
+2a981366 7-8% faster decoding by rewriting GetCoeffs()
+92d47e4c improve VP8L signature detection by checking the version bits too
+5cd43e43 Add -incremental option to dwebp
+54b8e3f6 webpmux: DisplayInfo(): remove unnecessary error checks.
+40ae3520 fix memleak in WebPIDelete()
+d9662658 mux.h doc: WebPMuxGetFrame() can return WEBP_MUX_MEMORY_ERROR too.
+0e6747f8 webpmux -info: display dimensions and has_alpha per frame
+d78a82c4 Sanity check for underflow
+8498f4bf Merge "remove -Wshadow warnings"
+e89c6fc8 Avoid a potential memleak
+3ebe1757 Merge "break down the proba 4D-array into some handy structs"
+6a44550a break down the proba 4D-array into some handy structs
+2f5e8934 remove -Wshadow warnings
+bf3a29b3 Merge "add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags"
+2b0a7593 Merge "fix some warnings from static analysis"
+22dd07ce mux.h: Some doc corrections
+79ff0346 add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags
+d51f45f0 fix some warnings from static analysis
+d134307b fix conversion warning on MSVC
+d538cea8 gif2webp: Support a 'min' and 'max'  key frame interval
+80b54e1c allow search with token buffer loop and fix PARTITION0 problem
+b7d4e042 add VP8EstimateTokenSize()
+10fddf53 enc/quant.c: silence a warning
+399cd456 Merge "fix compile error on ARM/gcc"
+9f24519e encoder: misc rate-related fixes
+c663bb21 Merge "simplify VP8IteratorSaveBoundary() arg passing"
+fa46b312 Demux.h: Correct a method name reference
+f8398c9d fix compile error on ARM/gcc
+f691f0e4 simplify VP8IteratorSaveBoundary() arg passing
+42542be8 up to 6% faster encoding with clang compiler
+93402f02 multi-threaded segment analysis
+7e2d6595 Merge "remove the PACK() bit-packing tricks"
+c13fecf9 remove the PACK() bit-packing tricks
+2fd091c9 Merge "use NULL for lf_stats_ testing, not bool"
+b11c9d62 dwebp: use default dct_method
+4bb8465f Merge "(de)mux.h: wrap pseudo-code in /* */"
+cfb56b17 make -pass option work with token buffers
+5416aab4 (de)mux.h: wrap pseudo-code in /* */
+35dba337 use NULL for lf_stats_ testing, not bool
+733a7faa enc->Iterator memory cleanup
+e81fac86 Add support for "no blend" in webpmux binary
+3b80bc48 gif2webp: Separate out each step into a method
+bef7e9cc Add doc precision about demux object keeping pointers to data.
+61405a14 dwebp: enable stdout output with WIC
+6eabb886 Merge "Animated WebP: add "do not blend" option to spec"
+be20decb fix compilation for BITS 24
+e58cc137 Merge "dwebp: s/unsigned char/uint8_t/"
+72501d43 dwebp: s/unsigned char/uint8_t/
+2c9633e8 Merge "gif2webp: Insert independent frames at regular intervals."
+f0d6a14b gif2webp: Insert independent frames at regular intervals.
+b25a6fbf yuv.h: fix indent
+ede3602e Merge "cosmetics: fix indent"
+3a65122a dwebp: fix stdout related output
+388a7249 cosmetics: fix indent
+4c7322c8 Merge "dsp: msvc compatibility"
+d50c7e32 Merge "5-7% faster SSE2 versions of YUV->RGB conversion functions"
+b8ab7847 Merge "simplify upsampler calls: only allow 'bottom' to be NULL"
+df6cebfa 5-7% faster SSE2 versions of YUV->RGB conversion functions
+ad6ac32d simplify upsampler calls: only allow 'bottom' to be NULL
+a5e8afaf output to stdout if file name is "-"
+f358450f dsp: msvc compatibility
+43a7c8eb Merge "cosmetics"
+4c5f19c1 Merge "bit_reader.h: cosmetics"
+f72fab70 cosmetics
+14dd5e78 fix const-ness
+b20aec49 Merge "Support for 'do not blend' option in vwebp"
+dcf65222 Support for 'do not blend' option in vwebp
+d5bad033 Animated WebP: add "do not blend" option to spec
+a2f5f73d Merge "Support for "Do not blend" in mux and demux libraries"
+e081f2f3 Pack code & extra_bits to Struct (VP8LPrefixCode).
+6284854b Support for "Do not blend" in mux and demux libraries
+f486aaa9 Merge "slightly faster ParseIntraMode"
+d1718632 slightly faster ParseIntraMode
+3ceca8ad bit_reader.h: cosmetics
+69257f70 Create LUT for PrefixEncode.
+988b7084 add WebPWorkerExecute() for convenient bypass
+06e24987 Merge "VP8EncIterator clean-up"
+de4d4ad5 VP8EncIterator clean-up
+7bbe9529 Merge "cosmetics: thread.c: drop a redundant comment"
+da411485 cosmetics: thread.c: drop a redundant comment
+feb4b6e6 thread.h: #ifdef when checking WEBP_USE_THREAD
+8924a3a7 thread.c: drop WebPWorker prefix from static funcs
+1aed8f2a Merge "fix indent"
+4038ed15 fix indent
+1693fd9b Demux: A new state WEBP_DEMUX_PARSE_ERROR
+8dcae8b3 fix rescaling-with-alpha inaccuracy
+11249abf Merge changes I9b4dc36c,I4e0eef4d
+52508a1f Mux: support parsing unknown chunks within a frame/fragment.
+05db0572 WebPMuxSetChunk: remove unused variable
+8ba1bf61 Stricter check for presence of alpha when writing lossless images
+a03c3516 Demux: WebPIterator now also denotes if the frame has alpha.
+6df743a3 Decoder: handle fragments case correctly too.
+faa4b07e Support for unknown chunks in mux library
+7d60bbc6 Speed up HashChainFindCopy function.
+66740140 Speedup Alpha plane encoding.
+b7346a1e 0.1 % speedup to decoding
+c606182e webp-container-spec: Tighten language added by last
+a34a5029 pngdec: output error messages from libpng
+e84c625d Merge "Detect canvas and image size mismatch in decoder."
+f626fe2e Detect canvas and image size mismatch in decoder.
+f5fbdee0 demux: stricter image bounds check
+30c8158a add extra assert in Huffman decode code
+8967b9f3 SSE2 for lossless decoding (critical) functions.
+699d80ea Jump-lookup for Huffman coding
+c34307ab fix some VS9 warnings about type conversion
+eeada35c pngdec: add missing include
+54b65108 gif2webp: If aligning to even offsets, extra pixels should be transparent
+0bcf5ce3 Merge "remove a malloc() in case we're using only FILTER_NONE for alpha"
+2c07143b remove a malloc() in case we're using only FILTER_NONE for alpha
+a4d5f59d Faster lossless decoding
+fd53bb75 Merge "alternate LUT-base reverse-bits code"
+d1c166ef Merge "Container spec: a clarification on background color."
+fdb91779 Rename a method
+5e967532 Container spec: a clarification on background color.
+30e77d0f Merge branch '0.3.0'
+1b631e29 alternate LUT-base reverse-bits code
+24cc307a ~20% faster lossless decoding
+313d853f Speedup for decoding lossless WebP photographs:
+24ee098a change the bytes_per_pixels_ field into more evocative use_8b_decode
+2a04b034 update ChangeLog (tag: v0.3.1-rc2, tag: v0.3.1)
+7288950b Regression fix for alpha channels using color cache:
+2e377b53 wicdec: silence a format warning
+ad9e42a6 muxedit: silence some uninitialized warnings
+3307c163 Don't set alpha-channel to 0xff for alpha->green uplift
+5130770c Merge "wicdec: silence a format warning"
+a37eff47 Regression fix for alpha channels using color cache:
+241cf99b Merge "muxedit: silence some uninitialized warnings"
+c8f9c84d Regression fix for alpha unfiltering:
+14cd5c6c muxedit: silence some uninitialized warnings
+a368db81 dec/vp8l: quiet vs9 x64 type conversion warning
+ffae9f31 wicdec: silence a format warning
+8cf0701e Alpha encoding: never filter in case of NO_COMPRESSION
+825e73b1 update ChangeLog (tag: v0.3.1-rc1)
+abf6f691 update NEWS
+5a92c1a5 bump version to 0.3.1
+86daf77c store top Y/U/V samples in packed fashion
+67bc353e Revert "add WebPBlendAlpha() function to blend colors against background"
+068db59e Intertwined decoding of alpha and RGB
+38cc0114 Simplify forward-WHT + SSE2 version
+3fa595a5 Support decoding up to given row in DECODE_DATA_FUNC
+520f005f DequantizeLevels(): Add 'row' and 'num_rows' args
+47374b82 Alpha unfilter for given set of rows
+f32097e0 probe input file and quick-check for WebP format.
+a2aed1d0 configure: improve gl/glut library test
+c7e89cbb update copyright text
+a00380d2 configure: remove use of AS_VAR_APPEND
+a94a88dd fix EXIF parsing in PNG
+a71e5d84 add doc precision for WebPPictureCopy() and WebPPictureView()
+8287012e remove datatype qualifier for vmnv
+e1908430 fix a memory leak in gif2webp
+0b18b9ee fix two minor memory leaks in webpmux
+db5095d5 remove some cruft from swig/libwebp.jar
+850e956f README: update swig notes
+bddd9b0a swig/python: add minimal documentation
+d573a8d5 swig: add python encode support
+6b931875 swig/java: reduce wrapper function code duplication
+6fe536f4 swig/java: rework uint8_t typemap
+a2ea4643 Fix the bug in ApplyPalette.
+7bb28d2a webp/lossless: fix big endian BGRA output
+f036d4bf Speed up ApplyPalette for ARGB pixels.
+8112c8cf remove some warnings:
+cc128e0b Further reduce memory to decode lossy+alpha images
+07db70d2 fix for big-endian
+eda8a7de gif2webp: Fix signed/unsigned comparison mismatch
+31f346fe Makefile.vc: fix libwebpdemux dll variable typo
+6c76d28e swig: add python (decode) support
+b4f5bb6c swig: cosmetics
+498d4dd6 WebP-Lossless encoding improvements.
+26e72442 swig: ifdef some Java specific code
+8ecec686 configure: add warning related flags
+e676b043 configure: add GLUT detection; build vwebp
+b0ffc437 Alpha decoding: significantly reduce memory usage
+20aa7a8d configure: add --enable-everything
+b8307cc0 configure.ac: add some helper macros
+980e7ae9 Remove the gcc compilation comments
+7f25ff99 gif2webp: Fix ICC and XMP support
+d8e53211 Add missing name to AUTHORS
+11edf5e2 Demux: Fix a potential memleak
+c7b92184 don't forward declare enums
+7a650c6a prevent signed int overflow in left shift ops
+31bea324 add precision about dynamic output reallocation with IDecoder
+c22877f7 Add incremental support for extended format files
+5051245f Makefile.vc: have 'all' target build everything
+8191deca Makefile.vc: flags cleanup
+b9d74735 Makefile.vc: drop /FD flag
+5568dbcf update gitignore
+f4c7b654 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
+1fb04bec pngdec: Avoid a double-free.
+dcbb1ca5 add WebPBlendAlpha() function to blend colors against background
+bc9f5fbe configure.ac: add AM_PROG_AR for automake >= 1.12
+bf867bf2 Tuned cross_color parameter (step) for lower qual
+90e2ec5a Merge "probe input file and quick-check for WebP format."
+7180d7ff Merge "update copyright text"
+830f72b7 probe input file and quick-check for WebP format.
+2ccf58d6 configure: improve gl/glut library test
+d640614d update copyright text
+c2113ad4 Merge "configure: remove use of AS_VAR_APPEND"
+9326a56f configure: remove use of AS_VAR_APPEND
+ea63d619 fix a type warning on VS9 x86
+bec11092 fix EXIF parsing in PNG
+b6e65f3d Merge "fix warnings for vs9 x64"
+438946dc fix warnings for vs9 x64
+f4710e3b collect macroblock reconstruction data in VP8MBData struct
+23d28e21 add doc precision for WebPPictureCopy() and WebPPictureView()
+518f2cd7 cosmetics: gif2webp: fix indent
+af358e68 Merge "remove datatype qualifier for vmnv"
+3fe91635 remove datatype qualifier for vmnv
+764fdffa fix a memory leak in gif2webp
+3e59a74d fix two minor memory leaks in webpmux
+47b9862f Merge "README: update swig notes"
+325d15ff remove some cruft from swig/libwebp.jar
+4a7627c2 README: update swig notes
+5da81e33 Merge "swig/python: add minimal documentation"
+f39e08f2 Merge "swig: add python encode support"
+6ca4a3e3 Merge "swig/java: reduce wrapper function code duplication"
+8f8702b0 Merge "swig/java: rework uint8_t typemap"
+91413be2 reduce memory for VP8MB and remove bitfields use
+7413394e Fix the memory leak in ApplyFilters.
+2053c2cf simplify the alpha-filter testing loop
+825b64db swig/python: add minimal documentation
+14677e11 swig: add python encode support
+a5c297c8 swig/java: reduce wrapper function code duplication
+ad4a367d swig/java: rework uint8_t typemap
+0d25876b use uint8_t for inv_palette[]
+afa3450c Fix the bug in ApplyPalette.
+2d6ac422 Merge "webp/lossless: fix big endian BGRA output"
+2ca83968 webp/lossless: fix big endian BGRA output
+742110cc Speed up ApplyPalette for ARGB pixels.
+2451e47d misc code cleanup
+83db4043 Merge "swig: add python (decode) support"
+eeeea8b5 Merge "swig: cosmetics"
+d5f9b8f3 Merge "libwebp: fix vp8 encoder mem alloc offsetting"
+d8edd835 libwebp: fix vp8 encoder mem alloc offsetting
+8983b83e remove use of bit-fields in VP8FInfo
+87a4fca2 remove some warnings:
+ba8f74e2 Merge "fix for big-endian"
+a65067fa Merge "Further reduce memory to decode lossy+alpha images"
+64c84486 Further reduce memory to decode lossy+alpha images
+332130b9 Mux: make a few methods static
+44370617 fix for big-endian
+5199eab5 Merge "add uncompressed TIFF output support"
+a3aede97 add uncompressed TIFF output support
+f975b67f Merge "gif2webp: Fix signed/unsigned comparison mismatch"
+5fbc734b Merge "GetFeatures: Detect invalid VP8X/VP8/VP8L data"
+d5060c87 Merge "mux.h: A comment fix + some consistency fixes"
+352d0dee GetFeatures: Detect invalid VP8X/VP8/VP8L data
+3ef79fef Cosmetic: "width * height"
+043e1ae4 gif2webp: Fix signed/unsigned comparison mismatch
+5818cff7 mux.h: A comment fix + some consistency fixes
+1153f888 Merge "swig: ifdef some Java specific code"
+3eeedae1 Makefile.vc: fix libwebpdemux dll variable typo
+f980faf4 swig: add python (decode) support
+7f5f42bb swig: cosmetics
+8eae188a WebP-Lossless encoding improvements.
+c7247c4c swig: ifdef some Java specific code
+4cb234d5 Merge "Mux: make ValidateForSingleImage() method static"
+ed6f5308 Merge "Add GetCanvasSize() method to mux"
+1d530c9a Mux: make ValidateForSingleImage() method static
+bba4c2b2 configure: add warning related flags
+fffefd18 Add GetCanvasSize() method to mux
+732da8d0 Merge "configure: add GLUT detection; build vwebp"
+0e513f7a configure: add GLUT detection; build vwebp
+55d1c150 Merge "Alpha decoding: significantly reduce memory usage"
+13d99fb5 Merge "configure: add --enable-everything"
+2bf698fe Merge "configure.ac: add some helper macros"
+edccd194 Alpha decoding: significantly reduce memory usage
+3cafcc9a configure: add --enable-everything
+4ef14477 configure.ac: add some helper macros
+a4e1cdbb Remove the gcc compilation comments
+6393fe4b Cosmetic fixes
+9c4ce971 Simplify forward-WHT + SSE2 version
+878b9da5 fix missed optim
+00046171 VP8GetInfo(): Check for zero width or height.
+9bf31293 align VP8Encoder::nz_ allocation
+5da165cf fix CheckMode() signature
+0ece07dc Merge "explicitly pad bitfields to 32-bits"
+9dbc9d19 explicitly pad bitfields to 32-bits
+5369a80f Merge "prevent signed int overflow in left shift ops"
+70e39712 Merge "cosmetics: remove unnecessary ';'s"
+d3136ce2 Merge "don't forward declare enums"
+b26e5ad5 gif2webp: Fix ICC and XMP support
+46089b20 Add missing name to AUTHORS
+94328d64 Demux: Fix a potential memleak
+96e948d7 don't forward declare enums
+f4f90880 prevent signed int overflow in left shift ops
+0261545e cosmetics: remove unnecessary ';'s
+7ebdf110 Merge "Fix few missing comparisons to NULL"
+1579989e Fix few missing comparisons to NULL
+ea1b21cf Cleaned up VP8GetHeaders() so that it parses only frame header
+b66caee4 dwebp: add support for BMP output
+ff885bfe add precision about dynamic output reallocation with IDecoder
+79241d5a Merge "Makefile.vc: have 'all' target build everything"
+ac1c729b Merge "Makefile.vc: flags cleanup"
+118a055c Merge "Makefile.vc: drop /FD flag"
+ecad0109 Merge "update gitignore"
+a681b4f4 Rename PRE_VP8 state to WEBP_HEADER
+ead4d478 Add incremental support for extended format files
+69d0f926 Makefile.vc: have 'all' target build everything
+52967498 Makefile.vc: flags cleanup
+c61baf0c Makefile.vc: drop /FD flag
+3a15125d update gitignore
+5167ca47 Merge "WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded."
+67708d67 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
+b68912af pngdec: Avoid a double-free.
+82abbe12 Merge "configure.ac: add AM_PROG_AR for automake >= 1.12"
+e7d9548c add WebPBlendAlpha() function to blend colors against background
+ed4dc717 configure.ac: add AM_PROG_AR for automake >= 1.12
+df4a406d Merge branch '0.3.0'
+1e0d4b8c Update ChangeLog (tag: v0.3.0-rc7, tag: v0.3.0)
+d52b405d Cosmetic fixes
+6cb4a618 misc style fix
+68111ab0 add missing YUVA->ARGB automatic conversion in WebPEncode()
+e9a7990b Cosmetic fixes
+403bfe82 Container spec: Clarify frame disposal
+2aaa423b Merge "add missing YUVA->ARGB automatic conversion in WebPEncode()"
+07d87bda add missing YUVA->ARGB automatic conversion in WebPEncode()
+142c4629 misc style fix
+3e7a13a0 Merge "Container spec: clarify the background color field" into 0.3.0
+14af7745 container doc: add a note about the 'ANMF' payload
+cc635efa Container spec: clarify the background color field
+e3e33949 container doc: move RIFF description to own section
+4299f398 libwebp/mux: fix double free
+33f9a692 Merge "demux: keep a frame tail pointer; used in AddFrame" into 0.3.0
+a2a7b959 use WebPDataCopy() instead of re-coding it.
+6f18f12f demux: keep a frame tail pointer; used in AddFrame
+e5af49e9 add doc precision about WebPParseHeaders() return codes
+db46daab Merge "Makefile.vc: fix dynamic builds" into 0.3.0
+53c77afc Merge "gif2webp: Bgcolor fix for a special case" into 0.3.0
+a5ebd143 gif2webp: Bgcolor fix for a special case
+6378f238 Merge "vwebp/animation: fix background dispose" into 0.3.0
+3c8eb9a8 fix bad saturation order in QuantizeBlock
+04c7a2ec vwebp/animation: fix background dispose
+81a50695 Makefile.vc: fix dynamic builds
+5f25c396 update ChangeLog (tag: v0.3.0-rc6)
+14d42af2 examples: don't use C99 %zu
+5ccf1fe5 update ChangeLog
+2560c243 update NEWS
+f43bafc3 Merge changes Iecccb09c,If5ee9fd2,I3e181ce4 into 0.3.0
+a788644f dwebp: warn when decoding animated webp's
+302efcdb Decode: return more meaningful error for animation
+ad452735 WebPBitstreamFeatures: add has_animation field
+783dfa49 disable FRGM decoding for good in libwebpmux
+4b956be0 Update ChangeLog
+ad8b86d7 update NEWS
+3e084f63 Merge "demux cosmetics: comments/rename internal function" into 0.3.0
+d3f8c621 Merge "move WebPFeatureFlags declaration" into 0.3.0
+7386fe50 Merge "libwebp{demux,mux}: install mux_types.h" into 0.3.0
+d6cd4e90 Merge "bump decode abi" into 0.3.0
+17f8da5c bump decode abi
+97684ae2 Merge "add doc precision about WebPDemuxPartial()" into 0.3.0
+f933fd2a move WebPFeatureFlags declaration
+289bc47b libwebp{demux,mux}: install mux_types.h
+224e8d46 add doc precision about WebPDemuxPartial()
+4c18e80c demux cosmetics: comments/rename internal function
+7cfd1bf1 update AUTHORS
+401f7b85 Merge "speed-up lossless (~3%) with ad-hoc histogram cost evaluation" into 0.3.0
+1fc8ffca Merge "makefile.unix: dist related changes" into 0.3.0
+8a89c6ed Merge changes I466c377f,Ib761ebd3,I694857fc into 0.3.0
+f4ffb2d5 speed-up lossless (~3%) with ad-hoc histogram cost evaluation
+723847d5 gif2webp: only write error messages to stderr
+701b9e2a makefile.unix: dist related changes
+bb85b437 Merge "update NEWS" into 0.3.0
+59423a24 gif2webp: fix crash on open failure with libgif5
+9acb17de gif2webp: silence a unused param warning
+7d9fdc23 Merge "README updates" into 0.3.0
+5621934e Merge "build: fix install race on shared headers" into 0.3.0
+70809d89 Merge "bump version to 0.3.0" into 0.3.0
+d851cd1d demux: make the parse a bit more strict
+28bb4107 update NEWS
+cef93882 bump version to 0.3.0
+9048494d build: fix install race on shared headers
+1e67e8ef README updates
+42b611a4 Merge "configure: drop experimental from mux/demux" into 0.3.0
+096a8e32 Merge "vwebp: add color profile support" into 0.3.0
+ddfee5dc vwebp: add color profile support
+0d6927d3 Merge "Mark fragment options as experimental in webpmux" into 0.3.0
+5dbd4030 Mark fragment options as experimental in webpmux
+a0a6648c configure: drop experimental from mux/demux
+ee65bad8 Merge "add support for BITS > 32" into 0.3.0
+744930db add support for BITS > 32
+7dd288f0 cwebp: fix build
+19a8dd01 Merge "Makefile.vc: add vwebp.exe target" into 0.3.0
+50eeddad Merge "examples: normalize icc related program arguments" into 0.3.0
+757f637f Merge "Makefile.vc: add libwebpdecoder target" into 0.3.0
+b65c4b7c Makefile.vc: add libwebpdecoder target
+f8db7b4a Merge "vwebp: replace doubles w/floats where appropriate" into 0.3.0
+d99aa56f Makefile.vc: add vwebp.exe target
+013023e7 vwebp: replace doubles w/floats where appropriate
+9b3db894 README.mux: add version reference
+7b6a26cf Merge "cwebp: output metadata statistics" into 0.3.0
+d8dc72a0 examples: normalize icc related program arguments
+7bfc9056 Merge "make alpha unfilter work in-place" into 0.3.0
+0037b2d2 Merge "add LUT-free reference code for YUV->RGB conversion." into 0.3.0
+166bf744 Merge "demux: disable fragment parsing" into 0.3.0
+126974b4 add LUT-free reference code for YUV->RGB conversion.
+0aef3ebd make alpha unfilter work in-place
+14ef5005 Merge "Remove 'status: experimental' from container spec" into 0.3.0
+d40c98e1 Merge "webpmux binary: tiny style fix" into 0.3.0
+0bc42689 cwebp: output metadata statistics
+bc039803 Merge "autoconf: normalize experimental define" into 0.3.0
+d1e21b13 Remove 'status: experimental' from container spec
+7681bb96 webpmux binary: tiny style fix
+a3dd3d0f avoid installing example_util.h
+252320e2 demux: disable fragment parsing
+537bde05 autoconf: normalize experimental define
+5e338e0b Merge changes I33e8a613,I8e8a7b44 into 0.3.0
+d9d0ea1b Merge changes If21e3ec7,I991fc30b into 0.3.0
+627f5ca6 automake: add reference to libwebp for mux/demux
+eef73d07 don't consolidate proba stats too often
+05ec4cc2 libwebp{,decoder}.pc: add pthread flags
+1bfcf5bf add libwebpmux.pc
+26ca843d add libwebpdemux.pc
+69e25906 Merge "Tune Lossless compression for lower qualities."
+0478b5d2 Tune Lossless compression for lower qualities.
+39f7586f add a mention of parallel alpha encoding in the NEWS
+5a21d967 Merge "1.5x-2x faster encoding for method 3 and up"
+9bfbdd14 1.5x-2x faster encoding for method 3 and up
+27dc741b Correct frame options order in README.mux
+be2fd173 Mux: fix a scenario with bad ANMF/FRGM size
+19eb012c Merge "Demux: Add option to get frame count using GetI()"
+7368b8cb Merge "WebPGetFeatures() out of if condition for clarity."
+f604c9a4 Merge "fix windows build"
+153f94e8 fix windows build
+847b4924 Merge "vwebp: use magenta for 'i'nfo display"
+25ea46bd Merge "vwebp: add keyboard shortcuts to help output"
+bea7ccaf vwebp: use magenta for 'i'nfo display
+8fab161a webpmux: correct -frame param order in help output
+03cc23d6 vwebp: add keyboard shortcuts to help output
+068eba8d Demux: Add option to get frame count using GetI()
+988b8f56 WebPGetFeatures() out of if condition for clarity.
+6933d910 Merge "gif2webp: Be lenient about background color index."
+4d0f7c55 Merge "WebPGetFeatures() behavior change:"
+fdeeb01d gif2webp: Be lenient about background color index.
+ad250320 Merge "multi-threaded alpha encoding for lossy"
+4e32d3e1 Merge "fix compilation of token.c"
+f817930a multi-threaded alpha encoding for lossy
+88050351 fix compilation of token.c
+fc816219 code using the actual values for num_parts_, not the ones from config
+72655350 Merge "move the config check from .c to .h"
+dd9e76f7 move the config check from .c to .h
+956b217a WebPGetFeatures() behavior change:
+df02e4ce WebPDemuxGetI behavior change:
+633c004d Merge "rebalance method tools (-m) for methods [0..4]"
+58ca6f65 rebalance method tools (-m) for methods [0..4]
+7648c3cc Merge "describe rd-opt levels introduce VP8RDLevel enum"
+67fb1003 Merge "autoconf: enable silent-rules by default"
+a5042a32 GetVersion() methods for mux and demux
+5189957e describe rd-opt levels introduce VP8RDLevel enum
+4e094ace autoconf: enable silent-rules by default
+b7eaa85d inline VP8LFastLog2() and VP8LFastSLog2 for small values
+5cf7792e split quant_levels.c into decoder and encoder version
+e5d3ffe2 Merge "Update code example in README.mux"
+ac5a9156 Update code example in README.mux
+38a91e99 Add example code snippet for demux API
+5f557f3c README.mux: add info about Demux API and vwebp
+c0ba0903 backward_references: avoid signed integer overflow
+943386db disable SSE2 for now
+9479fb7d lossless encoding speedup
+ec2030a8 merge two lines together
+b67956c0 Merge "Remove ReadOneBit() and ReadSymbolUnsafe()"
+1667bded Remove ReadOneBit() and ReadSymbolUnsafe()
+3151669b wicdec + dwebp cosmetics: normalize formatting
+92668da6 change default filtering parameters:   * type is now 'strong'   * strength is now '60'
+b7490f85 introduce WEBP_REFERENCE_IMPLEMENTATION compile option
+33838857 faster decoding (3%-6%)
+5c3e381b Merge "add a -jpeg_like option"
+c2311046 remove unused declaration of VP8Zigzag
+36152957 Merge "wicdec: add alpha support for paletted formats"
+c9f16490 wicdec: add alpha support for paletted formats
+1262f81e Merge "wicdec: silence some warnings"
+e7ea61eb wicdec: silence some warnings
+23c0f354 fix missing intptr_t->int cast for MSVC
+e895059a add a -jpeg_like option
+1f803f64 Merge "Tune alpha quality mapping to more reasonable values."
+1267d498 Tune alpha quality mapping to more reasonable values.
+043076e2 Merge "speed-up lossless in BackwardTrace"
+f3a44dcd remove one malloc from TraceBackwards()
+0fc1a3a0 speed-up lossless in BackwardTrace
+7c732e59 cwebp: centralize WebPCleanupTransparentArea()
+7381254e Merge "wicdec: add ICC profile extraction"
+e83ff7de wicdec: add ICC profile extraction
+146c6e3b Merge "cosmetics: pngdec: normalize default label location"
+a8f549d7 Merge "manpages: italicize option parameters"
+e118db83 Merge "encode.h: note the need to free() WebPMemoryWriter"
+1dfee6db cosmetics: pngdec: normalize default label location
+14c38200 manpages: italicize option parameters
+7defbfad encode.h: note the need to free() WebPMemoryWriter
+88d382a0 cwebp: cleanup after memory_writer
+12d6cecf fix extra space in dwebp.1 man
+b01681a9 Fix for demuxer frame iteration:
+56c12aa6 Demuxer creation fix:
+66c810bc add a -yuv option to dwebp (very similar to -pgm)
+841a3ba5 Merge "Remove -Wshadow warnings."
+8fd02527 Merge "upsampling_neon.c: fix build"
+6efed268 Remove -Wshadow warnings.
+60904aa6 Merge "allow WebPINewRGB/YUVA to be passed a NULL output buffer."
+b7adf376 allow WebPINewRGB/YUVA to be passed a NULL output buffer.
+27f8f742 upsampling_neon.c: fix build
+06b9cdf1 gitignore: add IOS related directories
+f112221e Merge "Fix more comments for iobuild.sh"
+fe4d25dd Fix more comments for iobuild.sh
+1de3e252 Merge "NEON optimised yuv to rgb conversion"
+090b708a NEON optimised yuv to rgb conversion
+daa06476 Merge "Add ios build script for building iOS library."
+79fe39e2 Add ios build script for building iOS library.
+126c035f remove some more -Wshadow warnings
+522e9d61 Merge "cwebp: enable '-metadata'"
+76ec5fa1 cwebp: enable '-metadata'
+aeb91a9d Merge "cosmetics: break a few long lines"
+be7c96b0 cosmetics: break a few long lines
+cff8ddb6 Merge "add libwebpdecoder.pc"
+93148ab8 Merge "libwebp.pc.in: detab"
+6477f955 Merge "Makefile.vc: normalize path separator"
+bed1ed7c add libwebpdecoder.pc
+46168b2d libwebp.pc.in: detab
+a941a346 Fixed few nits in the build files.
+dd7a49b2 Makefile.vc: normalize path separator
+9161be86 Merge "cwebp: extract WIC decoding to its own module"
+08e7c58e Merge "Provide an option to build decoder library."
+0aeba528 Provide an option to build decoder library.
+757ebcb1 catch malloc(0)/calloc(0) with an assert
+152ec3d2 Merge "handle malloc(0) and calloc(0) uniformly on all platforms"
+a452a555 cwebp: extract WIC decoding to its own module
+2b252a53 Merge "Provide option to swap bytes for 16 bit colormodes"
+94a48b4b Provide option to swap bytes for 16 bit colormodes
+42f8f934 handle malloc(0) and calloc(0) uniformly on all platforms
+8b2152c5 Merge "add an extra assert to check memory bounds"
+0d19fbff remove some -Wshadow warnings
+cd22f655 add an extra assert to check memory bounds
+8189feda Merge "Add details and reference about the YUV->RGB conversion"
+1d2702b1 Merge "Formatting fixes in lossless bitstream spec"
+8425aaee Formatting fixes in lossless bitstream spec
+a556cb1a Add details and reference about the YUV->RGB conversion
+d8f21e0b add link to SSIM description on Wikipedia
+18e9167e Merge "WebP-lossless spec clarifications:"
+98e25b9b Merge "cwebp: add -metadata option"
+f01c2a53 WebP-lossless spec clarifications:
+f4a97970 Merge "Disto4x4 and Disto16x16 in NEON"
+47b7b0ba Disto4x4 and Disto16x16 in NEON
+7eaee9f1 cwebp: add -metadata option
+36c52c2c tiffdec: use toff_t for exif ifd offset
+7c8111e4 Merge "cwebp/tiffdec: add TIFF metadata extraction"
+e6409adc Remove redundant include from dsp/lossless code.
+1ab5b3a7 Merge "configure: fix --with-gifincludedir"
+03c749eb configure: fix --with-gifincludedir
+8b650635 multiple libgif versions support for gif2webp
+476e293f gif2webp: Use DGifOpenFileName()
+b50f277b tiffdec: correct format string
+2b9048e3 Merge "tiffdec: check error returns for width/height"
+a1b5a9a3 Merge "cwebp/tiff: use the first image directory"
+079423f5 tiffdec: check error returns for width/height
+d62824af Merge "cwebp/jpegdec: add JPEG metadata extraction"
+03afaca4 Merge "cwebp: add PNG metadata extraction"
+2c724968 cwebp/jpegdec: add JPEG metadata extraction
+dba64d91 cwebp: add PNG metadata extraction
+1f075f89 Lossless spec corrections/rewording/clarifications
+2914ecfd cwebp/tiffdec: add TIFF metadata extraction
+d82a3e33 More corrections/clarifications in lossless spec:
+bd002557 cwebp/tiff: use the first image directory
+df7aa076 Merge "Cleanup around jpegdec"
+0f57dcc3 decoding speed-up (~1%)
+bcec339b Lossless bitstream clarification:
+6bf20874 add examples/metadata.c
+207f89c0 Merge "configure: add libwebpdemux status to summary"
+1bd287a6 Cleanup around jpegdec
+91455679 Merge "cosmetics: use '== 0' in size checks"
+d6b88b76 cosmetics: use '== 0' in size checks
+d3dace2f cosmetics: jpegdec
+2f69af73 configure: add libwebpdemux status to summary
+1c1c5646 cwebp: extract tiff decoding to its own module
+6a871d66 cwebp: extract jpeg decoding to its own module
+2ee228f9 cwebp: extract png decoding to its own module
+4679db00 Merge "cwebp: add metadata framework"
+63aba3ae cwebp: add metadata framework
+931bd516 lossless bitstream: block size bits correction
+e4fc4c1c lossless bitstream: block size bits correction
+d65ec678 fix build, move token.c to src/enc/
+657f5c91 move token buffer to its own file (token.c)
+c34a3758 introduce GetLargeValue() to slim-fast GetCoeffs().
+d5838cd5 faster non-transposing SSE2 4x4 FTransform
+f76191f9 speed up GetResidualCost()
+ba2aa0fd Add support for BITS=24 case
+2e7f6e8e makefile.unix: Dependency on libraries
+dca84219 Merge "Separate out mux and demux code and libraries:"
+23782f95 Separate out mux and demux code and libraries:
+bd56a01f configure: add summary output
+90e5e319 dwebp manual: point to webpmux, gif2webp.
+540790ca gif2webp.c: add a note about prerequisites
+d1edf697 cwebp man page: meaning of '-q' for lossy/lossless
+79efa1d0 Add man page for gif2webp utility
+2243e40c Merge "gif2webp build support with autoconf tools"
+c40efca1 gif2webp build support with autoconf tools
+6523e2d4 WebP Container:
+4da788da Merge "simplify the fwd transform"
+42c3b550 simplify the fwd transform
+41a6ced9 user GLfloat instead of float
+b5426119 fix indentation
+68f282f7 * handle offset in anim viewer 'vwebp' * fix gif2webp to handle disposal method and odd offset correctly
+118cb312 Merge "add SSE2 version of Sum of Square error for 16x16, 16x8 and 8x8 case"
+8a7c3cc8 Merge "Change the order of -frame argument to be more natural"
+99e0a707 Merge "Simplify the texture evaluation Disto4x4()"
+0f923c3f make the bundling work in a tmp buffer
+e5c3b3f5 Simplify the texture evaluation Disto4x4()
+48600084 Change the order of -frame argument to be more natural
+35bfd4c0 add SSE2 version of Sum of Square error for 16x16, 16x8 and 8x8 case
+a7305c2e Clarification for unknown chunks
+4c4398e2 Refine WebP Container Spec wrt unknown chunks.
+2ca642e0 Rectify WebPMuxGetFeatures:
+7caab1d8 Some cosmetic/comment fixes.
+60b2651a Merge "Write a GIF to WebP converter based on libgif."
+c7127a4d Merge "Add NEON version of FTransformWHT"
+11b27212 Write a GIF to WebP converter based on libgif.
+e9a15a37 ExUtilWriteFile() to write memory segment to file
+74356eb5 Add a simple cleanup step in mux assembly:
+51bb1e5d mux.h: correct WebPDemuxSelectFragment() prototype
+22a0fd9d Add NEON version of FTransformWHT
+fa30c863 Update mux code to match the spec wrt animation
+d9c5fbef by-pass Analysis pass in case segments=1
+d2ad4450 Merge changes Ibeccffc3,Id1585b16
+5c8be251 Merge "Chunk fourCCs for XMP/EXIF"
+a00a3daf Use 'frgm' instead of 'tile' in webpmux parameters
+81b8a741 Design change in ANMF and FRGM chunks:
+f903cbab Chunk fourCCs for XMP/EXIF
+812933d6 Tune performance of HistogramCombine
+52ad1979 Animation specification in container spec
+001b9302 Image fragment specification in container spec
+391f9db9 Ordering of description of bits in container spec
+d5735776 Metadata specification in container spec
+1c4609b1 Merge commit 'v0.2.1'
+0ca584cb Merge "Color profile specification in container spec"
+e8b41ad1 add NEON asm version for WHT inverse transform
+af6f0db2 Color profile specification in container spec
+a61a824b Merge "Add NULL check in chunk APIs"
+0e8b7eed fix WebPPictureView() unassigned strides
+75e5f17e ARM/NEON: 30% encoding speed-up
+02b43568 Add NULL check in chunk APIs
+a0770727 mux struct naming
+6c66dde8 Merge "Tune Lossless encoder"
+ab5ea217 Tune Lossless encoder
+74fefc8c Update ChangeLog (tag: v0.2.1, origin/0.2.0, 0.2.0)
+92f8059c Rename some chunks:
+3bb4bbeb Merge "Mux API change:"
+d0c79f05 Mux API change:
+abc06044 Merge "update NEWS" into 0.2.0
+57cf313b update NEWS
+25f585c4 bump version to 0.2.1
+fed7c048 libwebp: validate chunk size in ParseOptionalChunks
+552cd9bc cwebp (windows): fix alpha image import on XP
+b14fea99 autoconf/libwebp: enable dll builds for mingw
+4a8fb272 [cd]webp: always output windows errors
+d6621580 fix double to float conversion warning
+72b96a69 cwebp: fix jpg encodes on XP
+734f762a VP8LAllocateHistogramSet: fix overflow in size calculation
+f9cb58fb GetHistoBits: fix integer overflow
+b30add20 EncodeImageInternal: fix uninitialized free
+3de58d77 fix the -g/O3 discrepancy for 32bit compile
+77aa7d50 fix the BITS=8 case
+e5970bda Make *InitSSE2() functions be empty on non-SSE2 platform
+ef5cc47e make *InitSSE2() functions be empty on non-SSE2 platform
+c4ea259d make VP8DspInitNEON() public
+8344eadf Merge "libwebp: validate chunk size in ParseOptionalChunks"
+4828bb93 Merge "cwebp (windows): fix alpha image import on XP"
+30763333 libwebp: validate chunk size in ParseOptionalChunks
+70481898 AccumulateLSIM: fix double -> float warnings
+eda8ee4b cwebp (windows): fix alpha image import on XP
+c6e98658 Merge "add EXPERIMENTAL code for YUV-JPEG colorspace"
+f0360b4f add EXPERIMENTAL code for YUV-JPEG colorspace
+f86e6abe add LSIM metric to WebPPictureDistortion()
+c3aa215a Speed up HistogramCombine for lower qualities.
+1765cb1c Merge "autoconf/libwebp: enable dll builds for mingw"
+a13562e8 autoconf/libwebp: enable dll builds for mingw
+9f469b57 typo: no_fancy -> no_fancy_upsampling
+1a27f2f8 Merge "fix double to float conversion warning"
+cf1e90de Merge "cwebp: fix jpg encodes on XP"
+f2b5d19b [cd]webp: always output windows errors
+e855208c fix double to float conversion warning
+ecd66f77 cwebp: fix jpg encodes on XP
+7b3eb372 Tune lossless compression to get better gains.
+ce8bff45 Merge "VP8LAllocateHistogramSet: fix overflow in size calculation"
+ab5b67a1 Merge "EncodeImageInternal: fix uninitialized free"
+7fee5d12 Merge "GetHistoBits: fix integer overflow"
+a6ae04d4 VP8LAllocateHistogramSet: fix overflow in size calculation
+80237c43 GetHistoBits: fix integer overflow
+8a997235 EncodeImageInternal: fix uninitialized free
+0b9e6829 minor cosmetics
+a792b913 fix the -g/O3 discrepancy for 32bit compile
+73ba4357 Merge "detect and merge similar segments"
+fee66275 detect and merge similar segments
+0c44f415 src/webp/*.h: don't forward declare enums in C++
+d7a5ac86 vwebp: use demux interface
+931e0ea1 Merge "replace 'typedef struct {} X;" by "typedef struct X X; struct X {};""
+8f216f7e remove cases of equal comparison for qsort()
+28d25c82 replace 'typedef struct {} X;" by "typedef struct X X; struct X {};"
+2afee60a speed up for ARM using 8bit for boolean decoder
+5725caba new segmentation algorithm
+2cf1f815 Merge "fix the BITS=8 case"
+12f78aec fix the BITS=8 case
+6920c71f fix MSVC warnings regarding implicit uint64 to uint32 conversions
+f6c096aa webpmux binary: Rename 'xmp' option to 'meta'
+ddfe871a webpmux help correction
+b7c55442 Merge "Make *InitSSE2() functions be empty on non-SSE2 platform"
+1c04a0d4 Common APIs for chunks metadata and color profile.
+2a3117a1 Merge "Create WebPMuxFrameInfo struct for Mux APIs"
+5c3a7231 Make *InitSSE2() functions be empty on non-SSE2 platform
+7c6e60f4 make *InitSSE2() functions be empty on non-SSE2 platform
+c7eb4576 make VP8DspInitNEON() public
+ab3234ae Create WebPMuxFrameInfo struct for Mux APIs
+e3990fd8 Alignment fixes
+e55fbd6d Merge branch '0.2.0'
+4238bc0a Update ChangeLog (tag: v0.2.0)
+c655380c dec/io.c: cosmetics
+fe1958f1 RGBA4444: harmonize lossless/lossy alpha values
+681cb30a fix RGBA4444 output w/fancy upsampling
+f06c1d8f Merge "Alignment fix" into 0.2.0
+f56e98fd Alignment fix
+6fe843ba avoid rgb-premultiply if there's only trivial alpha values
+528a11af fix the ARGB4444 premultiply arithmetic
+a0a48855 Lossless decoder fix for a special transform order
+62dd9bb2 Update encoding heuristic w.r.t palette colors.
+6f4272b0 remove unused ApplyInverseTransform()
+93bf0faa Update ChangeLog (tag: v0.2.0-rc1)
+5934fc59 update AUTHORS
+014a711d update NEWS
+43b0d610 add support for ARGB -> YUVA conversion for lossless decoder
+33705ca0 bump version to 0.2.0
+c40d7ef1 fix alpha-plane check + add extra checks
+a06f8023 MODE_YUVA: set alpha to opaque if the image has none
+52a87dd7 Merge "silence one more warning" into 0.2.0
+3b023093 silence one more warning
+f94b04f0 move some RGB->YUV functions to yuv.h
+4b71ba03 README: sync [cd]webp help output
+c9ae57f5 man/dwebp.1: add links to output file format details
+292ec5cc quiet a few 'uninitialized' warnings
+4af3f6c4 fix indentation
+9b261bf5 remove the last NOT_HAVE_LOG2 instances
+323dc4d9 remove use of log2(). Use VP8LFastLog2() instead.
+8c515d54 Merge "harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc" into 0.2.0
+d4b4bb02 Merge changes I46090628,I1a41b2ce into 0.2.0
+bff34ac1 harness some malloc/calloc to use WebPSafeMalloc and WebPSafeCalloc
+a3c063c7 Merge "extra size check for security" into 0.2.0
+5e796300 Merge "WebPEncode: clear stats at the start of encode" into 0.2.0
+f1edf62f Merge "rationalize use of color-cache" into 0.2.0
+c1933317 extra size check for security
+906be657 rationalize use of color-cache
+dd1c3873 Add image-hint for low-color images.
+4eb7aa64 Merge "WebPCheckMalloc() and WebPCheckCalloc():" into 0.2.0
+80cc7303 WebPCheckMalloc() and WebPCheckCalloc():
+183cba83 check VP8LBitWriterInit return
+cbfa9eec lossless: fix crash on user abort
+256afefa cwebp: exit immediately on version mismatch
+475d87d7 WebPEncode: clear stats at the start of encode
+a7cc7291 fix type and conversion warnings
+7d853d79 add stats for lossless
+d39177b7 make QuantizeLevels() store the sum of squared error
+5955cf5e replace x*155/100 by x*101581>>16
+7d732f90 make QuantizeLevels() store the sum of squared error
+e45a446a replace x*155/100 by x*101581>>16
+159b75d3 cwebp output size consistency:
+cbee59eb Merge commit 'v0.1.99'
+1889e9b6 dwebp: report -alpha option
+3bc3f7c0 Merge "dwebp: add PAM output support" into 0.2.0
+d919ed06 dwebp: add PAM output support
+85e215d3 README/manpages/configure: update website link
+c3a207b9 Update ChangeLog (tag: v0.1.99)
+d1fd7826 Merge "add extra precision about default values and behaviour" into 0.2.0
+efc826e0 add extra precision about default values and behaviour
+9f29635d header/doc clean up
+ff9fd1ba Makefile.vc: fix webpmux.exe *-dynamic builds
+8aacc7b0 remove INAM, ICOP, ... chunks from the test webp file.
+2fc13015 harmonize authors as "Name (mail@address)"
+4a9f37b7 Merge "update NEWS" into 0.2.0
+7415ae13 makefile.unix: provide examples/webpmux target
+ce82cedc update NEWS
+641e28e8 Merge "man/cwebp.1: wording, change the date" into 0.2.0
+c37c23e5 README: cosmetics
+3976dcd5 man/cwebp.1: wording, change the date
+3e5bbe1c Merge "rename 'use_argb_input' to 'use_argb'" into 0.2.0
+ce90847a Merge "add some padding bytes areas for later use" into 0.2.0
+2390dabc Merge "fixing the findings by Frederic Kayser to the bitstream spec" into 0.2.0
+02751591 add a very crude progress report for lossless
+a4b9b1c6 Remove some unused enum values.
+dd108176 rename 'use_argb_input' to 'use_argb'
+90516ae8 add some padding bytes areas for later use
+d03b2503 fixing the findings by Frederic Kayser to the bitstream spec
+ce156afc add missing ABI compatibility checks
+9d45416a Merge "Doc: container spec text tweaks" into 0.2.0
+4e2e0a8c Doc: container spec text tweaks
+f7f16a29 add ABI compatibility check
+2a775570 Merge "swig: add WebPEncodeLossless* wrappers" into 0.2.0
+a3ec6225 mux.h: remove '* const' from function parameters
+31426eba encode.h: remove '* const' from function parameters
+9838e5d5 decode.h: remove '* const' from function parameters
+4972302d swig: add WebPEncodeLossless* wrappers
+9ff00cae bump encoder/decoder versions
+c2416c9b add lossless quick encoding functions to the public API
+4c1f5d64 Merge "NEWS: mention decode_vp8.h is no longer installed" into 0.2.0
+6cb2277d NEWS: mention decode_vp8.h is no longer installed
+d5e5ad63 move decode_vp8.h from webp/ to dec/
+8d3b04a2 Merge "header clean-up" into 0.2.0
+02201c35 Merge "remove one malloc() by making color_cache non dynamic" into 0.2.0
+d708ec14 Merge "move MIN/MAX_HISTO_BITS to format_constants.h" into 0.2.0
+ab2da3e9 Merge "add a malloc() check" into 0.2.0
+2d571bd8 add a malloc() check
+7f0c178e remove one malloc() by making color_cache non dynamic
+6569cd7c Merge "VP8LFillBitWindow: use 64-bit path for msvc x64 builds" into 0.2.0
+23d34f31 header clean-up
+2a3ab6f9 move MIN/MAX_HISTO_BITS to format_constants.h
+985d3da6 Merge "shuffle variables in HashChainFindCopy" into 0.2.0
+cdf885c6 shuffle variables in HashChainFindCopy
+c3b014db Android.mk: add missing lossless files
+8c1cc6b5 makefile.unix dist: explicitly name installed includes
+7f4647ee Merge "clarify the colorspace naming and byte ordering of decoded samples" into 0.2.0
+cbf69724 clarify the colorspace naming and byte ordering of decoded samples
+857650c8 Mux: Add WebPDataInit() and remove WebPImageInfo
+ff771e77 don't install webp/decode_vp8.h
+596dff78 VP8LFillBitWindow: use 64-bit path for msvc x64 builds
+3ca7ce98 Merge "doc: remove non-finalized chunk references" into 0.2.0
+1efaa5a3 Merge "bump versions" into 0.2.0
+51fa13e1 Merge "README: update cwebp help output" into 0.2.0
+12f9aede README: update cwebp help output
+f0b5defb bump versions
+4c42a61b update AUTHORS
+6431a1ce doc: remove non-finalized chunk references
+8130c4cc Merge "build: remove libwebpmux from default targets/config"
+23b44438 Merge "configure: broaden test for libpng-config"
+85bff2cd Merge "doc: correct lossless prefix coding table & code"
+05108f6e Merge "More spec/code matching in mux:"
+6808e69d More spec/code matching in mux:
+bd2b46f5 Merge "doc/webp-container-spec: light cosmetics"
+20ead329 doc/webp-container-spec: light cosmetics
+1d40a8bc configure: add pthread detection
+b5e9067a fix some int <-> size_t mix for buffer sizes
+e41a7596 build: remove libwebpmux from default targets/config
+0fc2baae configure: broaden test for libpng-config
+45b8272c Merge "restore authorship to lossless bitstream doc"
+06ba0590 restore authorship to lossless bitstream doc
+44a09a3c add missing description of the alpha filtering methods
+63db87dd Merge "vwebp: add checkerboard background for alpha display"
+a73b8978 vwebp: add checkerboard background for alpha display
+939158ce Merge "vwebp: fix info display"
+b35c07d9 vwebp: fix info display
+48b39eb1 fix underflow for very short bitstreams
+7e622984 cosmetics: param alignment, manpage wording
+1bd7dd50 Merge changes I7b0afb0d,I7ecc9708
+ac69e63e Merge "Updated cwebp man's help for Alpha & Lossless."
+c0e8859d Get rid of image_info_ from WebPChunk struct.
+135ca69e WebP Container Spec:
+eb6f9b8a Updated cwebp man's help for Alpha & Lossless.
+0fa844fb cosmetic fixes on assert and 'const' where applicable
+7f22bd25 check limit of width * height is 32 bits
+16c46e83 autoconf/make: cosmetics: break long lines
+ab22a07a configure: add helper macro to define --with-*
+c17699b3 configure: add libtiff test
+0e09732c Merge "cwebp: fix crash with yuv input + lossless"
+88a510ff Merge "fix big-endian VP8LWriteBits"
+da99e3bf Merge "Makefile.vc: split mux into separate lib"
+7bda392b cwebp: fix crash with yuv input + lossless
+f56a369a fix big-endian VP8LWriteBits
+54169d6c Merge "cwebp: name InputFileFormat members consistently"
+e2feefa9 Makefile.vc: split mux into separate lib
+27caa5aa Merge "cwebp: add basic TIFF support"
+d8921dd4 cwebp: name InputFileFormat members consistently
+6f76d246 cwebp: add basic TIFF support
+4691407b Merge changes If39ab7f5,I3658b5ae
+cca7c7b8 Fixed nit: 10 -> 10.f
+5d09a244 WebPMuxCreate() error handling:
+777341c3 Fix a memleak in WebPMuxCreate()
+61c9d161 doc: correct lossless prefix coding table & code
+4c397579 Merge "mark VP8{,L}{GetInfo,CheckSignature} as WEBP_EXTERN"
+e4e36cc6 Merge "Mux: Allow only some frames/tiles to have alpha."
+ad2aad3c Merge "WebP Decoding error handling:"
+97649c8f Mux: Allow only some frames/tiles to have alpha.
+f864be3b Lower the quality settings for Alpha encoding.
+3ba81bbe WebP Decoding error handling:
+fcc69923 add automatic YUVA/ARGB conversion during WebPEncode()
+802e012a fix compilation in non-FANCY_UPSAMPLING mode
+e012dfd9 make width/height coding match the spec
+228d96a5 mark VP8{,L}{GetInfo,CheckSignature} as WEBP_EXTERN
+637a314f remove the now unused *KeepA variants
+d11f6fcc webpmux returns error strings rather than numbers
+fcec0593 makefile.unix: cwebp: fix OSX link
+6b811f1b Merge "doc: remove lossless pdf"
+c9634821 doc: remove lossless pdf
+b9ae4f0d cosmetics after mux changes b74ed6e, b494ad5
+b494ad50 Mux: only allow adding frame/tiles at the end.
+2c341b0e Merge "Added image characteristic hint for the codec."
+d373076a Added image characteristic hint for the codec.
+2ed2adb5 Merge "msvc: add intrinsic based BitsLog2Floor"
+e595e7c5 Merge "add demux.c to the makefiles"
+da47b5bd Merge "demux: add {Next,Prev}Chunk"
+e5f46742 add demux.c to the makefiles
+4708393c demux: add {Next,Prev}Chunk
+e8a0a821 demux: quiet msvc warnings
+7f8472a6 Update the WebP Container Spec.
+31b68fe6 cleanup WebPPicture struct and API
+9144a186 add overflow check before calling malloc()
+81720c91 consistency cosmetics
+2ebe8394 Merge "Add kramdown version information to README"
+71443084 enc/vp8l.c: fix build
+b7ac19fe Add kramdown version information to README
+efdcb667 Merge "Edit for consistency, usage and grammar."
+08220102 Enable alpha in vvwebp
+8de9a084 Merge "Mux API change:"
+b74ed6e7 Mux API change:
+233a589e take picture->argb_stride into account for lossless coding
+04e33f17 Edit for consistency, usage and grammar.
+a575b4bc Merge "cosmetics: add missing const"
+8d99b0f4 Merge "cosmetics: remove unimplemented function proto"
+69d02217 cosmetics: add missing const
+5b08318b cosmetics: remove unimplemented function proto
+b7fb0ed5 Log warning for unsupported options for lossless.
+e1f769fe msvc: add intrinsic based BitsLog2Floor
+8a69c7d8 Bug-fix: Clamp backward dist to 1.
+b5b6ac97 Merge "Bring the special writer 'WebPMemoryWriter' to public API"
+a6a1909f Merge "Fix floating point exception with cwebp -progress"
+f2cee067 Fix floating point exception with cwebp -progress
+91b7a8c7 Bring the special writer 'WebPMemoryWriter' to public API
+310e2972 support resize and crop for RGBA input
+a89835d3 Merge changes Ice662960,Ie8d7aa90,I2d996d5e,I01c04772
+ce614c0c Merge "dec/vp8: avoid setting decoder status twice"
+900285da dec/vp8: avoid setting decoder status twice
+8227adc8 Merge changes I6f02b0d0,I5cbc9c0a,I9dd9d4ed,Id684d2a1
+dcda59c1 Merge "demux: rename SetTile to SelectTile"
+622ef12e demux: rename SetTile to SelectTile
+81ebd375 Merge "demux: add {Next,Prev}Frame"
+02dd37a2 demux: add {Next,Prev}Frame
+4b79fa59 Merge "Limit the maximum size of huffman Image to 16MB."
+9aa34b34 Manually number "chapters," as chapter numbers are used in the narrative.
+2a4c6c29 Re-wrap at <= 72 columns
+a45adc19 Apply inline emphasis and monospacing, per gdoc / PDF
+91011206 Incorporate gdoc changes through 2012-06-08
+7a182487 Removed CodeRay syntax declarations ...
+b3ec18c5 Provide for code-block syntax highlighting.
+709d7702 Replace high ASCII artifacts (curly quotes, etc.).
+930e8abb Lossless WebP doc largely ported to markdown text.
+18cae37b msvc: silence some build warnings
+b3923084 Limit the maximum size of huffman Image to 16MB.
+f180df2a Merge "libwebp/demux: add Frame/Chunk iteration"
+2bbe1c9a Merge "Enable lossless encoder code"
+d0601b01 Merge changes I1d97a633,I81c59093
+78f3e345 Enable lossless encoder code
+d974a9cc Merge "libwebp/demux: add simple format parsing"
+26bf2232 Merge "libwebp: add WebPDemux stub functions"
+2f666688 Merge "modify WebPParseHeaders to allow reuse by GetFeatures"
+b402b1fb libwebp/demux: add Frame/Chunk iteration
+ad9ada3b libwebp/demux: add WebPDemuxGetI
+2f2d4d58 libwebp/demux: add extended format parsing
+962dcef6 libwebp/demux: add simple format parsing
+f8f94081 libwebp: add WebPDemux stub functions
+fb47bb5c Merge "NumNamedElements() should take an enum param."
+7c689805 Fix asserts in Palette and BackwardReference code.
+fbdcb7ea NumNamedElements() should take an enum param.
+fb4943bd modify WebPParseHeaders to allow reuse by GetFeatures
+3697b5ce write an ad-hoc EncodeImageInternal variant
+eaee9e79 Bug-Fix: Decode small (less than 32 bytes) images.
+0bceae48 Merge "cwebp: fix alpha reporting in stats output"
+0424b1ef Rebase default encoding settings.
+c71ff9e3 cwebp: fix alpha reporting in stats output
+e2ffe446 Merge "Stop indefinite recursion for Huffman Image."
+70eb2bd6 Stop indefinite recursion for Huffman Image.
+f3bab8eb Update vwebp
+6d5c797c Remove support for partial files in Mux.
+f1df5587 WebPMuxAssemble() returns WebPData*.
+814a0639 Rename 'Add' APIs to 'Set'.
+bbb0218f Update Mux pseudo-code examples.
+4fc4a47f Use WebPData in MUX set APIs
+c67bc979 Merge "add WebPPictureImportRGBX() and WebPPictureImportBGRX()"
+27519bc2 add WebPPictureImportRGBX() and WebPPictureImportBGRX()
+f80cd27e factorize code in Import()
+9b715026 histogram: add log2 wrapper
+8c34378f Merge "fix some implicit type conversion warnings"
+42f6df9d fix some implicit type conversion warnings
+250c16e3 Merge "doc: update lossless pdf"
+9d9daba4 Merge "add a PDF of the lossless spec"
+8fbb9188 prefer webp/types.h over stdint.h
+0ca170c2 doc: update lossless pdf
+0862ac6e add a PDF of the lossless spec
+437999fb introduce a generic WebPPictureHasTransparency() function
+d2b6c6c0 cosmetic fixes after Idaba281a
+b4e6645c Merge "add colorspace for premultiplied alpha"
+48f82757 add colorspace for premultiplied alpha
+069f903a Change in lossless bit-stream.
+5f7bb3f5 Merge "WebPReportProgress: use non-encoder specific params"
+f18281ff WebPReportProgress: use non-encoder specific params
+9ef32283 Add support for raw lossless bitstream in decoder.
+7cbee29a Fix bug: InitIo resetting fancy_upsampling flag.
+880fd98c vwebp: fix exit w/freeglut
+1875d926 trap two unchecked error conditions
+87b4a908 no need to have mux.h as noinst clause in enc/
+88f41ec6 doc: fix bit alignment in VP8X chunk
+52f5a4ef Merge "fix bug with lossy-alpha output stride"
+3bde22d7 fix bug with lossy-alpha output stride
+42d61b6d update the spec for the lossy-alpha compression methods.
+e75dc805 Move some more defines to format_constants.h
+c13f6632 Move consts to internal header format_constants.h
+7f2dfc92 use a bit-set transforms_seen_ instead of looping
+18da1f53 modulate alpha-compression effort according to config.method
+f5f2fff6 Merge "Alpha flag fix for lossless."
+c975c44e Alpha flag fix for lossless.
+4f067fb2 Merge "Android: only build dec_neon with NEON support"
+255c66b4 Android: only build dec_neon with NEON support
+8f9117a9 cosmetics: signature fixes
+39bf5d64 use header-less lossless bitstream for alpha channel
+75d7f3b2 Merge "make input data be 'const' for VP8LInverseTransform()"
+9a721c6d make input data be 'const' for VP8LInverseTransform()
+9fc64edc Disallow re-use of same transformation.
+98ec717f use a function pointer for ProcessRows()
+f7ae5e37 cosmetics: join line
+140b89a3 factor out buffer alloc in AllocateARGBBuffers()
+a107dfa8 Rectify WebPParseOptionalChunks().
+237eab67 Add two more color-spaces for lossless decoding.
+27f417ab fix orthographic typo
+489ec335 add VP8LEncodeStream() to compress lossless image stream
+fa8bc3db make WebPEncodingSetError() take a const picture
+638528cd bitstream update for lossy alpha compression
+d73e63a7 add DequantizeLevels() placeholder
+ec122e09 remove arch-dependent rand()
+d40e7653 fix alignment
+1dd6a8b6 Merge "remove tcoder, switch alpha-plane compression to lossless"
+3e863dda remove tcoder, switch alpha-plane compression to lossless
+8d77dc29 Add support for lossless in mux:
+831bd131 Make tile size a function of encoding method.
+778c5228 Merge "remove some variable shadowing"
+817c9dce Few more HuffmanTreeToken conversions.
+37a77a6b remove some variable shadowing
+89c07c96 Merge "normalize example header includes"
+4aff411f Merge "add example_util.[hc]"
+00b29e28 normalize example header includes
+061263a7 add example_util.[hc]
+c6882c49 merge all tree processing into a single VP8LProcessTree()
+9c7a3cf5 fix VP8LHistogramNumCodes to handle the case palette_code_bits == 0
+b5551d2e Merge "Added HuffmanTreeCode Struct for tree codes."
+8b85d01c Added HuffmanTreeCode Struct for tree codes.
+093f76d8 Merge "Allocate single memory in GetHuffBitLengthsAndCodes."
+41d80494 Allocate single memory in GetHuffBitLengthsAndCodes.
+1b04f6d2 Correct size in VP8L header.
+2924a5ae Makefile.vc: split object lists based on directory
+c8f24165 Merge "add assert(tokens)"
+43239947 add assert(tokens)
+9f547450 Catch an error in DecodeImageData().
+ac8e5e42 minor typo and style fix
+9f566d1d clean-up around Huffman-encode
+c579a710 Introduce CHUNK_SIZE_BYTES in muxi.h.
+14757f8a Make sure huffman trees always have valid symbols
+41050618 makefile.unix: add support for building vwebp
+48b37721 Merge "fixed signed/unsigned comparison warning"
+57f696da Merge "EncodeImageInternal: fix potential leak"
+d972cdf2 EncodeImageInternal: fix potential leak
+5cd12c3d fixed signed/unsigned comparison warning
+cdca30d0 Merge "cosmetics: shorten long line"
+e025fb55 cosmetics: shorten long line
+22671ed6 Merge "enc/vp8l: fix double free on error"
+e1b9b052 Merge "cosmetics: VP8LCreateHuffmanTree: fix indent"
+a8e725f8 enc/vp8l: fix double free on error
+27541fbd cosmetics: VP8LCreateHuffmanTree: fix indent
+1d38b258 cwebp/windows: use MAKE_REFGUID where appropriate
+817ef6e9 Merge "cwebp: fix WIC/Microsoft SDK compatibility issue"
+902d3e3b cwebp: fix WIC/Microsoft SDK compatibility issue
+89d803c4 Merge "Fix a crash due to wrong pointer-integer arithmetic."
+cb1bd741 Merge "Fix a crash in lossless decoder."
+de2fe202 Merge "Some cleanup in VP8LCreateHuffmanTree() (and related functions CompareHuffmanTrees() and SetBitDepths()): - Move 'tree_size' initialization and malloc for 'tree + tree_pool'   outside the loop. - Some renames/tweaks for readability."
+ce69177a Fix a crash due to wrong pointer-integer arithmetic.
+e40a3684 Fix a crash in lossless decoder.
+3927ff3a remove unneeded error condition for WebPMuxNumNamedElements()
+2c140e11 Some cleanup in VP8LCreateHuffmanTree() (and related functions CompareHuffmanTrees() and SetBitDepths()): - Move 'tree_size' initialization and malloc for 'tree + tree_pool'   outside the loop. - Some renames/tweaks for readability.
+861a5b7b add support for animation
+eb5c16cc Merge "Set correct encode size in encoder's stats."
+4abe04a2 fix the return value and handle missing input file case.
+2fafb855 Set correct encode size in encoder's stats.
+e7167a2b Provide one entry point for backward references.
+c4ccab64 Print relevant lossless encoding stats in cwebp.
+e3302cfd GetHuffBitLengthsAndCodes: reduce level of indirection
+b5f2a9ed enc/vp8l: fix uninitialized variable warning
+7885f8b2 makefile.unix: add lossless encoder files
+1261a4c8 Merge "cosmetics"
+3926b5be Merge "dsp/cpu.c: Android: fix crash on non-neon arm builds"
+834f937f dsp/cpu.c: Android: fix crash on non-neon arm builds
+126e1606 cosmetics
+e38602d2 Merge branch 'lossless_encoder'
+e8d3d6a0 split StoreHuffmanCode() into smaller functions
+d0d88990 more consolidation: introduce VP8LHistogramSet
+1a210ef1 big code clean-up and refactoring and optimization
+41b5c8ff Some cosmetics in histogram.c
+ada6ff77 Approximate FastLog between value range [256, 8192]
+ec123ca3 Forgot to update out_bit_costs to symbol_bit_costs at one instance.
+cf33ccd1 Evaluate output cluster's bit_costs once in HistogramRefine.
+781c01f4 Simple Huffman code changes.
+a2849bc5 Lossless decoder: remove an unneeded param in ReadHuffmanCodeLengths().
+b39e7487 Reducing emerging palette size from 11 to 9 bits.
+bfc73db4 Move GetHistImageSymbols to histogram.c
+889a5786 Improve predict vs no-predict heuristic.
+01f50663 code-moving and clean-up
+31035f3b reduce memory usage by allocating only one histo
+fbb501b8 Restrict histo_bits to ensure histo_image size is under 32MB
+8415ddf3 further simplification for the meta-Huffman coding
+e4917299 A quick pass of cleanup in backward reference code
+83332b3c Make transform bits a function of encode method (-m).
+72920caa introduce -lossless option, protected by USE_LOSSLESS_ENCODER
+c6ac4dfb Run TraceBackwards for higher qualities.
+412222c8 Make histo_bits and transform_bits function of quality.
+149b5098 Update lossless encoder strategy:
+0e6fa065 cache_bits passed to EncodeImageInternal()
+e38b40a9 Factorize code for clearing HtreeGroup.
+6f4a16ea Removing the indirection of meta-huffman tables.
+3d33ecd1 Some renaming/comments related to palette in lossless encoder.
+4d02d586 Lossless encoder: correction in Palette storage
+4a636235 fix a memleak in EncodeImageInternal()
+0993a611 Full and final fix for prediction transform
+afd2102f Fix cross-color transform in lossless encoder
+b96d8740 Need to write a '0' bit at the end of transforms.
+54dad7e5 Color cache size should be counted as 0 when cache bits = 0
+4f0c5caf Fix prediction transform in lossless encoder.
+36dabdad Fix memory leak in method EncodeImageInternal for histogram_image.
+352a4f49 Get rid of PackLiteralBitLengths()
+d673b6b9 Change the predictor function to pass left pixel
+b2f99465 Fix CopyTileWithPrediction()
+84547f54 Add EncodeImageInternal() method.
+6b38378a Guard the lossless encoder (in flux) under a flag
+09f7532c Fix few nits (const qualifiers)
+648be393 Added implementation for various lossless functions
+32714ce3 Add VP8L prefix to backward ref & histogram methods.
+fcba7be2 Fixed header file tag (WEBP_UTILS_HUFFMAN_ENCODE_H_)
+bc703746 Add backward_ref, histogram & huffman encode modules from lossless.
+fdccaadd Fixing nits
+227110c4 libwebp interface changes for lossless encoding.
+50679acf minor style fixes
+b38dfccf remove unneeded reference to NUM_LITERAL_CODES
+8979675b harmonize header description
+c04eb7be tcoder.c: define NOT_HAVE_LOG2 for MSVC builds
+9a214fa1 Merge "VP8[L]GetInfo: check input pointers"
+5c5be8ba VP8[L]GetInfo: check input pointers
+0c188fec Merge changes I431acdfe,I713659b7
+b3515c62 mux: drop 'chunk' from ChunkInfo member names
+aea7923c muxi.h: remove some unused defines
+01422492 update NEWS file for next release
+29e3f7ec Merge "dec: remove deprecated WebPINew()"
+4718e449 Merge "muxedit: a few more size_t changes"
+82654f96 Merge "muxedit: remove a few redundant NULL checks"
+02f27fbd dec: remove deprecated WebPINew()
+ccddb3fc muxedit: remove a few redundant NULL checks
+a6cdf710 muxedit: a few more size_t changes
+a3846892 Merge "mux: remove unused LIST_ID"
+11ae46ae alpha.c: quiet some size_t -> int conversion warnings
+dee46692 mux: remove unused LIST_ID
+03f1f493 mux: add version checked entry points
+6a0abdaa Merge "doc: tile/alpha corrections"
+c8139fbe Merge "few cosmetics"
+68338737 Merge "lossless: remove some size_t -> int conversions"
+5249e94a doc: tile/alpha corrections
+d96e722b huffman: quiet int64 -> int conversion warning
+532020f2 lossless: remove some size_t -> int conversions
+23be6edf few cosmetics
+1349edad Merge "configure: AC_ARG_* use AS_HELP_STRING"
+bfbcc60a configure: AC_ARG_* use AS_HELP_STRING
+1427ca8e Merge "Makefile.am: header file maintenance"
+087332e3 Merge "remove unused parameter 'round' from CalcProba()"
+9630e168 remove unused parameter 'round' from CalcProba()
+92092eaa Merge "bit_reader.h: correct include"
+a87fc3f6 Merge "mux: ensure # images = # tiles"
+53af99b1 Merge "mux: use size_t consistently"
+39a57dae Makefile.am: header file maintenance
+1bd0bd0d bit_reader.h: correct include
+326a3c6b mux: ensure # images = # tiles
+95667b8d mux: use size_t consistently
+231ec1fb Removing the indirection of meta-huffman tables.
+15ebcbaa check return pointer from MuxImageGetListFromId
+b0d6c4a7 Merge "configure: remove test for zlib.h"
+8cccac50 Merge "dsp/lossless: silence some build warnings"
+b08819a6 dsp/lossless: silence some build warnings
+7ae22521 Android.mk: SSE2 & NEON updates
+0a49e3f3 Merge "makefile.unix add missing header files"
+2e75a9a1 Merge "decode.h: use size_t consistently"
+fa13035e configure: remove test for zlib.h
+d3adc81d makefile.unix add missing header files
+262fe01b Merge "makefile.unix & Android.mk: cosmetics"
+4cce137e Merge "enc_sse2 add missing stdlib.h include"
+80256b85 enc_sse2 add missing stdlib.h include
+9b3d1f3a decode.h: use size_t consistently
+64083d3c Merge "Makefile.am: cosmetics"
+dceb8b4d Merge changes If1331d3c,I86fe3847
+0e33d7bf Merge "webp/decode.h: fix prototypes"
+fac0f12e rename BitReader to VP8LBitReader
+fbd82b5a types.h: centralize use of stddef.h
+2154835f Makefile.am: cosmetics
+1c92bd37 vp8io: use size_t for buffer size
+90ead710 fix some more uint32_t -> size_t typing
+cbe705c7 webp/decode.h: fix prototypes
+3f8ec1c2 makefile.unix & Android.mk: cosmetics
+217ec7f4 Remove tabs in configure.ac
+b3d35fc1 Merge "Android.mk & Makefile.vc: add new files"
+0df04b9e Android.mk & Makefile.vc: add new files
+e4f20c5b Merge "automake: replace 'silent-rules' w/AM_SILENT_RULES"
+8d254a09 cosmetics
+6860c2ea fix some uint32_t -> size_t typing
+4af1858a Fix a crash due to max symbol in a tree >= alphabet size
+6f01b830 split the VP8 and VP8L decoding properly
+f2623dbe enable lossless decoder
+b96efd7d add dec/vp8i.h changes from experimental
+19f6398e add dec/vp8l{i.h,.c} from experimental
+c4ae53c8 add utils/bit_reader.[hc] changes from experimental
+514d0089 add dsp/lossless.[hc] from experimental
+9c67291d add utils/huffman.[hc] from experimental
+337914a0 add utils/color_cache.[hc] from experimental
+b3bf8fe7 the read-overflow code-path wasn't reporting as an error
+1db888ba take colorspace into account when cropping
+61c2d51f move the rescaling code into its own file and make enc/ and dec/ use it.
+efc2016a Make rescaler methods generic
+3eacee81 Move rescaler methods out of io.c.
+a69b893d automake: replace 'silent-rules' w/AM_SILENT_RULES
+6f7bf645 issue 111: fix little-endian problem in bit-reader
+ed278e22 Removed unnecessary lookup
+cd8c3ba7 fix some warnings: down-cast and possibly-uninitialized variable
+0a7102ba ~1% improvement of alpha compression
+3bc1b141 Merge "Reformat container doc"
+dc17abdc mux: cosmetics
+cb5810df Merge "WebPMuxGetImage: allow image param to be NULL"
+506a4af2 mux: cosmetics
+135e8b19 WebPMuxGetImage: allow image param to be NULL
+de556b68 Merge "README.mux: reword some descriptions"
+0ee2aeb9 Makefile.vc: use batch mode rules
+d9acddc0 msvc: move {i,p}db creation to object directory
+237c9aa7 Merge "expose WebPFree function for DLL builds"
+b3e4054f silence msvc debug build warning
+45feb55d expose WebPFree function for DLL builds
+11316d84 README.mux: reword some descriptions
+4be52f4a factorize WebPMuxValidate
+14f6b9f6 mux: light cleanup
+5e96a5db add more param checks to WebPPictureDistortion()
+8abaf820 Merge "silence some type size related warnings"
+1601a39b silence some type size related warnings
+f3abe520 Merge "idec: simplify buffer size calculation"
+a9c5cd4c idec: simplify buffer size calculation
+7b06bd7f Merge "configure/automake: add silent-rules option"
+e9a7d145 Reformat container doc
+d4e5c7f3 configure/automake: add silent-rules option
+5081db78 configure/automake: no -version-info for convenience libs
+85b6ff68 Merge "idec: fix WebPIUpdate failure"
+7bb6a9cc idec: fix internal state corruption
+89cd1bb8 idec: fix WebPIUpdate failure
+01b63806 4-5% faster decoding, optimized byte loads in arithmetic decoder.
+631117ea Merge "cosmetics & warnings"
+a0b2736d cosmetics & warnings
+f73947f4 use 32bit for storing dequant coeffs, instead of 16b.
+b9600308 Merge "store prediction mode array as uint8_t[16], not int[16]."
+7b67881a store prediction mode array as uint8_t[16], not int[16].
+cab8d4dc Merge "NEON TransformOne"
+ba503fda NEON TransformOne
+9f740e3b Merge "gcc warning fix: remove the 'const' qualifier."
+f76d3587 gcc warning fix: remove the 'const' qualifier.
+e78478d6 Merge "webpmux: make more use of WebPData"
+f85bba3d Merge "manpages: add BUGS section"
+48a43bbf Merge "makefile.unix: variable cosmetics"
+c274dc96 makefile.unix: variable cosmetics
+1f7b8595 re-organize the error-handling in the main loop a bit
+1336fa71 Only recompute level_cost_[] when needed
+771ee449 manpages: add BUGS section
+0f7820e6 webpmux: make more use of WebPData
+974aaff3 examples: logging updates
+6c14aadd Merge "better token buffer code"
+f4054250 better token buffer code
+18d959fa Merge "mux: add WebPData type"
+eec4b877 mux: add WebPData type
+0de3096b use 16bit counters for recording proba counts
+7f23678d fix for LevelCost + little speed-up
+7107d544 further speed-up/cleanup of RecordCoeffs() and GetResidualCost()
+fd221040 Introduce Token buffer (unused for now)
+5fa148f4 Merge "speed-up GetResidualCost()"
+28a9d9b4 speed-up GetResidualCost()
+11e7dadd Merge "misc cosmetics"
+378086bd misc cosmetics
+d61479f9 add -print_psnr and -print_ssim options to cwebp.
+2e3e8b2e add a WebPCleanupTransparentArea() method
+552c1217 Merge "mux: plug some memory leaks on error"
+a2a81f7d Merge "fix Mach-O shared library build"
+b3482c43 Merge "fix gcc-4.0 apple 32-bit build"
+e4e3ec19 fix gcc-4.0 apple 32-bit build
+b0d2fecf mux: plug some memory leaks on error
+f0d2c7a7 pass of cosmetics
+b309a6f9 fix Mach-O shared library build
+241ddd38 doc: delete mux container pdf
+8b1ba272 doc: update VP8 decode guide link
+7e4371c5 WebPMuxCreate: fix unchecked malloc
+eb425586 Merge "have makefile.unix clean up src/webp/*~ too"
+a85c3631 Merge "correct EncodeAlpha documentation"
+a33842fd Merge "Update webp container spec with alpha filter options."
+8d6490da Incremental support for some of the mux APIs.
+b8375abd have makefile.unix clean up src/webp/*~ too
+b5855fc7 correct EncodeAlpha documentation
+dba37fea Update webp container spec with alpha filter options.
+2e74ec8b fix compile under MINGW
+716d1d7f fix suboptimal MAX_LEN cut-off limit
+57cab7b8 Harmonize the alpha-filter predictions at boundary
+3a989534 Merge "Fix bug for Alpha in RGBA_4444 color-mode."
+8ca2076d Introduce a 'fast' alpha mode
+221a06bb Fix bug for Alpha in RGBA_4444 color-mode.
+ad1e163a cosmetics: normalize copyright headers
+c77424d7 cosmetics: light include cleanup
+9d0e17c9 fix msvc build breakage after 252028a
+7c4c177c Some readability fixes for mux library
+d8a47e66 Merge "Add predictive filtering option for Alpha."
+252028aa Add predictive filtering option for Alpha.
+9b69be1c Merge "Simplify mux library code"
+a056170e Simplify mux library code
+992187a3 improve log2 test
+e852f832 update Android.mk file list
+a90cb2be reduce number of copies and mallocs in alpha plane enc/dec
+b1662b05 fix some more type conversion warnings w/MSVC
+223d8c60 fix some uint64_t -> int conversion warnings with MSC
+c1a0437b Merge "simplify checks for enabling SSE2 code"
+f06817aa simplify checks for enabling SSE2 code
+948d4fe9 silence a msvc build warning
+91179549 vwebp: msvc build tweaks
+7937b409 simple WebP viewer, based on OpenGL
+6aac1df1 add a bunch of missing 'extern "C"'
+421eb99d Merge "Remove assigned-but-not-used variable "br""
+91e27f45 better fitting names for upsampling functions
+a5d7ed5c Remove assigned-but-not-used variable "br"
+f62d2c94 remove unused 'has_alpha' from VP8GetInfo() signature
+08e86582 trap alpha-decoding error
+b361eca1 add cut-off to arith coder probability update.
+8666a93a Some bug-fixes for images with alpha.
+273a12a0 fix off-by-1 diff in case cropping and simple filtering
+2f741d1e webpmux: ReadImage: fix ptr free in error case
+721f3f48 fix alpha decode
+60942c8c fix the has_alpha_ order
+30971c9e Implement progress report (and user abort)
+eda520a9 cosmetics after 9523f2a
+38bd5bb5 Merge "Better alpha support in webpmux binary"
+ccbaebfe Merge "Updated the includes to relative paths."
+d71fbdcc fix small typo in error message array
+cdf97aa2 Better alpha support in webpmux binary
+885f25bc Updated the includes to relative paths.
+a0ec9aac Update WebP encoder (cwebp) to support Alpha.
+667b769a Fixed the include for types.h within mux.h
+9523f2a5 Add Alpha Encode support from WebPEncode.
+16612ddd Merge "Add Alpha Decode support from WebPDecode."
+d117a940 Add Alpha Decode support from WebPDecode.
+67228734 cosmetics after e1947a9
+e1947a92 Add Alpha encode/decode code.
+afc4c5d6 simplify code by introducing a CopyPlane() helper func
+113b3128 Merge "MUX API Updates"
+c398f595 MUX API Updates
+5acf04ef remove orphan source file
+059f03ef Merge "dec: validate colorspace before using as array index"
+70a03989 Merge "factorize some code"
+9b243b3d factorize some code
+372e2b46 Correct a bug in ReadPNG() with GRAY_ALPHA images
+469d6eb9 Merge "Makefile.am: remove redundant noinst_HEADERS"
+9fe3372f dec: validate colorspace before using as array index
+8962030f remove orphan source file
+ced3e3f4 Makefile.am: remove redundant noinst_HEADERS
+964387ed use WEBP_INLINE for inline function declarations
+90880a11 Merge "manpages: break long lines"
+b5910895 Merge "manpages: minor formatting updates"
+4c451e4a Merge "Rectify the Chunk parsing logic."
+04e84cf1 examples: slight cleanup
+099717ce manpages: break long lines
+1daf39bb manpages: minor formatting updates
+abd030b5 fix missing "(void)" in function signature
+f6a7d758 remove useless test
+f07b2138 Rectify the Chunk parsing logic.
+b8634f7d webpmux: fix lib link order
+42c2e682 Fix missing comma (on uncompiled code)
+d8329d41 Android.mk: add missing source files
+13a54df5 Merge "More aggressive copy-edit; add TODO; validate HTML5"
+868b96ae More aggressive copy-edit; add TODO; validate HTML5
+767afea2 configure: check for a symbol contained in libpng
+408b8918 Merge "Linewrap at 72 cols. Casual copy-edit."
+3ae318c7 Merge "Restore (most) emphasis; add emphasis to normative RFC 2119 terms (MUST, etc.)"
+918eb2d8 Merge "Basic container doc source clean-up; fix lists and pseudocode blocks."
+03bec9e0 Linewrap at 72 cols. Casual copy-edit.
+2678d819 Restore (most) emphasis; add emphasis to normative RFC 2119 terms (MUST, etc.)
+428674da Basic container doc source clean-up; fix lists and pseudocode blocks.
+6a77d928 Merge "Makefile.vc: cosmetics"
+28c38e8c Merge "Makefile.vc: condense directory creation rules"
+55be2cf8 Initial import of container spec document, from pdftotext transform.
+a82a788b Makefile.vc: cosmetics
+c8f41ce5 Makefile.vc: condense directory creation rules
+2b877cd0 Some fixes to Makefile.vc to support the src\mux directory.
+3eb969b3 Merge "Add Makefile.vc for Mux library & binary."
+e78e971e Add Makefile.vc for Mux library & binary.
+6aedde58 Add manual for WebPMux tool.
+8a360d0a Merge "Added WebPMux Binary."
+a4f32cae Added WebPMux Binary.
+f3bf4c76 Added Mux Container Spec & README for MUX-API.
+9f761cfa Changed function signature for WebPMuxCreate
+5f31b5ec Merge "Add Mux library for manipulating WebP container."
+2315785f Add Mux library for manipulating WebP container.
+7e198abb update ChangeLog (tag: v0.1.3)
+dfc9c1ea Harmonize the dates
+28ad70c5 Fix PNG decoding bug
+846e93c5 Update AUTHORS & add .mailmap
+563e52d6 cosmetics after '76036f5 Refactor decoder library'
+76036f54 Refactor decoder library
+377ef43c configure.ac: update AC_INIT params
+7a8d8762 use a user-visible MACRO for max width/height.
+d4e9f559 NEON decode support in WebP
+0ee683b5 update libtool version-info
+fdbe02c5 windows: match _cond_destroy logic w/return variable name
+206b686b README: correct advanced decode api pseudo-code
+6a32a0f5 make VP8BitReader a typedef, for better re-use
+b112e836 create a libwebputils under src/utils
+ee697d9f harmonize the include guards and #endif comments
+a1ec07a6 Fixing compiler error in non x86 arch.
+dcfa509a Fixed recursive inclusion of bit_writer.h and vp8enci.h.
+e06ac088 create a separate libwebpdsp under src/dsp
+ebeb412a use unsigned int for bitfields
+341cc56a make kNewRange a static array
+227a91e5 README: minor wording update
+05bd8e6a add man pages to dist
+812dfa1a bump up versions in preparations for 0.1.3
+a5b78c81 wrap alpha-related options under WEBP_EXPERIMENTAL_FEATURES flag
+34dc7907 regen ChangeLog for 0.1.3-rc2
+7c436630 Silence some (more) Visual Studio warnings.
+60306e8c add top-level gitattributes
+2aa6b80e Slience some Visual Studio warnings.
+4cbbb290 Merge "bump up version for next freeze"
+a3291674 bump up version for next freeze
+c7e86aba cosmetics: fix comment line lengths
+c9e037ab makefile.unix: add simple dist target
+87d58ce9 makefile.unix: rule maintenance
+d477de77 mend
+fac15ec7 Update NEWS & README for next release V0.1.3
+6215595c Merge "add a -partition_limit option to limit the number of bits used by intra4x4"
+3814b76c Merge "reorganize chunk-parsing code"
+900286e0 add a -partition_limit option to limit the number of bits used by intra4x4
+cd12b4b0 add the missing cost for I4/I16 mode selection
+dfcc2136 reorganize chunk-parsing code
+3cf20306 initialize pointers to function within VP8DspInit()
+d21b4795 Merge "windows: add decode threading support"
+473ae953 fix hang on thread creation failure
+fccca420 windows: add decode threading support
+a31f843a Use the exact PNG_INCLUDES/PNG_LIBS when testing for -lpng
+ad9b45f1 Merge "Makefile.vc: rule maintenance"
+565a2cab Makefile.vc: rule maintenance
+2d0da681 makefile.unix: disable Wvla by default
+fc7815d6 multi-thread decoding: ~25-30% faster
+acd8ba42 io->teardown() was not always called upon error
+c85527b1 Merge "Makefile.vc: add DLL configs"
+e1e9be35 cosmetics: spelling/grammar in README and lib headers
+b4d0ef8f Makefile.vc: add DLL configs
+998754a7 remove unused nb_i4_ and nb_i16_ fields.
+9f01ce3a rename WebPDecBuffer::memory -> private_memory
+fb5d659b fix an overflow bug in LUT calculation
+d646d5c7 swig: add WebPDecodeARGB
+78aeed40 add missing WebPDecodeARGBInto() and switch ARGB4444 to RGBA4444 as was intended
+cd7c5292 explicitly mark library functions as extern
+19db59f8 add support for RGB565, ARGB4444 and ARGB colorspace (decoder)
+c915fb2a encoder speed-up: hardcode special level values
+c558bdad Rename and improve the API to retrieve decoded area
+bf599d74 Merge "makefile.unix: disable -Wvla by default"
+c9ea03d7 SSE2 version of strong filtering
+993af3e2 makefile.unix: disable -Wvla by default
+3827e1bc Merge "examples: (windows/WIC) add alpha support"
+e291fae0 SSE2 functions for the fancy upsampler.
+a06bbe2e add WebPISetIOHooks() to set some custom hooks on the incremental decoder object.
+7643a6f2 Merge "makefile.unix: use uname to detect OSX environment"
+5142a0be export alpha channel (if present) when dumping to PGM format
+14d5731c makefile.unix: use uname to detect OSX environment
+08057062 examples: quiet warnings
+3cfe0888 examples: (windows/WIC) add alpha support
+13ed94b8 add compile warning for variable-length-array
+5a18eb1a Merge "add Advanced Decoding Interface"
+5c4f27f9 add missing \n
+f4c4e416 80 cols fix
+d2603105 add Advanced Decoding Interface
+bd2f65f6 sse2 version of the complex filter
+96ed9ce0 perform two idct transforms at a time when possible
+01af7b69 use aligned stored
+0e1d1fdf Merge "Makefile.vc: add experimental target"
+2a1292a6 Makefile.vc: add experimental target
+23bf351e Enable decode SSE2 for Visual Studio
+131a4b7b dec/dsp_sse2: fix visual studio compile
+00d9d680 swig: file reorganization
+7fc7e0d9 Merge "swig/java: basic encode support"
+3be57b16 fix MSVC compile for WEBP_EXPERIMENTAL_FEATURES
+40a7e347 dec/dsp: disable sse2 for Visual Studio builds
+e4d540c8 add SSE2 code for transform
+54f2170a swig/java: basic encode support
+c5d4584b call function pointers instead of C-version
+ea43f045 Merge "configure: mingw32 targets: test for WIC support"
+a11009d7 SSE2 version of simple in-loop filtering
+42548da9 shave one unneeded filter-cache line
+31f9dc6f configure: mingw32 targets: test for WIC support
+19559699 Merge "split expression in two."
+415dbe46 split expression in two.
+e29072a8 configure: test for zlib only w/--enable-experimental
+b2b0090b Simplify Visual Studio ifdefs
+ca7a2fd6 Add error reporting from encoding failures.
+6c9405db Merge "Makefile.vc: require CFG with clean target"
+0424ecd9 Makefile.vc: require CFG with clean target
+003417c7 Enable SSE2 for Visual Studio builds
+af10db4a little speed up for VP8BitUpdate()
+e71418f8 more MSVC files to ignore
+46d90363 cosmetics
+edf59ab3 typo fix
+72229f5f Add support for x64 and SSE2 builds under Windows.
+92e5c6e1 VP8GetInfo() + WebPResetDecParams()
+416b7a6b raise the fixed-point precision for the rescaler
+aa87e4e0 fix alignment
+eb66670c disable WEBP_EXPERIMENTAL_FEATURES
+c5ae7f65 typo fix: USE_ => WEBP_
+d041efae swig: add libwebp.jar/libwebp_java_wrap.c
+f6fb3877 add swig interface
+e9273902 align buffer for double too
+842c009b fix -strong option
+d0a70387 Merge "cosmetics"
+fc0a02e5 fix the dichotomy loop
+38369c03 cosmetics
+8dfc4c6f factorize and unify GetAlpha() between the C and SSE2 version
+6d0e66c2 prepare experimentation with yuv444 / 422
+79cc49f5 add a --enable-experimental option to './configure'
+d7575238 sse2 version of CollectHistogram()
+c1c728d6 add an extra #ifdef WEBP_EXPERIMENTAL_FEATURES to avoid 'unused variable' warning
+60c61d2d always call VP*EncDeleteAlpha() unconditionnally, for simplicity
+0f8c6384 simply don't call WriteExtensions() if WEBP_EXPERIMENTAL_FEATURES is not defined
+47c661d5 rename swap -> swap_rb
+10d55bbb move chunk[] declaration out of the for() loop
+517cec21 fix indentation
+f7d9e261 fix merge problems
+8fd42b3a add a stride 'a_stride' for the alpha plane
+b8dcbf2f fix alpha-plane copy and crop methods
+cdef89de fix some 'unused variable' warning
+fb29c262 SSE2 version of the fwd transform and the squared sum metric
+2ab4b72f EXPERIMENTAL: add support for alpha channel
+cfbf88a6 add SSE2 functions. ~2x faster encoding on average.
+e7ff3f9a merge two ITransforms together when applicable and change the TTransform to return the sum directly.
+ca554137 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
+8aa50efd fix some 'man' typos
+d3f3bdda update ChangeLog (tag: v0.1.2)
+d7e9a69c update contributor list
+261abb8e add a 'superclean' section
+276ae825 Remove files not mean to be in git, and update .gitignore
+24868455 build: prepare libwebp.pc
+14ceb6e8 add "-version" description to man pages
+b247a3b2 Create the m4 directory, and also place .gitignore in there for libtool.
+cdd734c9 Resolve automake warnings
+c5fa726e build: add pkgconfig files
+b20aaca2 build: just use autoreconf, avoid calling tools manually
+4b0b0d66 cwebp: use modern functions
+efbc6c41 update Android.mk
+7777570b better version of ChangeLog
+fa70d2b7 update version number in the DOC
+f8db5d5d more C89-fixes
+0de013b3 fix typos
+650ffa3b add version getters for decoder and encoder
+be4867d2 doc for incremental decoding
+56732a1b add idec.obj in MSVC makefile
+208afb5e add c++ guards
+8bf76fe0 add incremental decoding
+1f288328 'inline' isn't defined in strict ansi c89
+8b77c632 move the quantization function to dsp.c
+b2c3575c add a 'last_y' field to WebPDecParams
+2654c3da correctly pass along the exact same status returned from ParsePartitions
+4704146a add missing precision in the man
+6d978a6c add error messages
+6463e6ab add some install instructions, and fix intel-mac flags
+05fb7bfc Merge ".gitignore: initial version"
+c33f0195 .gitignore: initial version
+e532b9ab Makefile: allow out of tree builds
+4c0da7aa enable sparse dc/ac transforms
+07dbb8d5 clarify the return logic
+5c69e1bb fix bigger-by-1 array
+7c5267e3 fix a (harmless) typo: non_zero_ -> non_zero_ac_
+bc752135 fix missing free()
+af3e2aaa remove trailing spaces
+13e50da6 make the bitreader preload at least 8bits, instead of post-load them (this makes initialization easier and will be helpful for incremental decoding). Modify ParsePartitions() to accommodate for truncated input.
+f4888f77 emit 9 - nb_bits trailing zeros instead of 8
+3db65255 separate block-parsing into a visible VP8DecodeMB()
+a871de02 add missing extern "C"
+b3ce8c52 remove a gcc warning about type pun by using a proper union'd type
+e1863715 update after addition of webpi.h
+3e856e2d Extract some useful functions around decoding buffer WebPDecParams.
+d5bc05a4 make the filtering process match libvpx and ffvp8
+dd60138d add man pages for cwebp(1) and dwebp(1)
+c4fa3644 fix header
+5b70b378 * add an option to bypass_filtering in VP8Io.
+b97a4003 simplify QuantizeBlock code a bit
+84b58ebb add more checks around picture allocation
+b65a3e10     remove absolute_delta_ field and syntax code
+0744e842 Dont' open output file until we're sure the input file is valid
+d5bd54c7 fix typo and buggy line
+f7a9549d Add a simple top-level makefile.unix for quick & easy build.
+5f36b944 update the doc for the -f option
+f61d14aa a WebP encoder converts PNG & JPEG to WebP
+81c96621 oops: forgotten call to Initialize() + move the error message to a more useful place
+87ffa005 typo: fix a missing 'R', was confusing.
+b04b857a * add decoding measurement using stopwatch.h (use -v option) * support PNG output through WIC on Win32
+746a4820 * make (*put)() hook return a bool for abort request. * add an enum for VP8Status() to make things clearer
+73c973e6 * strengthen riff/chunk size checks * don't consider odd-sized chunks being an error
+1dc4611a add support for PNG output (default) regularize include guards
+860641df fix a typo: sizeof(kYModeProbaInter0) => sizeof(kUVModeProbaInter0)
+3254fc52 fix some petty constness fix the ./configure file too
+504d3393 fix eof_ mis-initialization
+2bc0778f leftover Makefile.* from previous commit
+d2cf04e4 move Makefile.am one level below, to src/dec fix typos here and there dwebp is now an installed program
+ade92de8 typo: vp8.h -> decode_vp8.h
+d7241241 forgot to declare types.h to be installed
+6421a7a4 move the decoder sourcetree to a sub-location src/dec to make room for future libs sources
+a9b3eab6 correct layout name is IMC4.
+2330522c handle corner case of zero-dimensions
+280c3658 make VP8Init() handle short buffers (< 2 bytes) correctly
+b1c9e8b4 handle error cases more robustly
+0e94935c Merge "table-less version of clip_8b()"
+1e0a2d25 table-less version of clip_8b()
+e12109ee dwebp: change -yuv option to -raw change the layout to IMC2
+d72180a4 speed-up fancy upscaler
+9145f3bc reset eof_ at construction time
+a7ee0559 simplify the logic of GetCoeffs()
+f67b5939 lot of cosmetics
+ea27d7c6 fix endian problem on PowerPC
+beb0a1ba fix signature of VP8StoreBlock
+b128c5e2 Merge "fancy chroma upscaling"
+6a37a2aa fancy chroma upscaling
+ff565edc fix two numeric typos
+5a936a0a use uintptr_t for casting pointers to ints
+e14a0301 for cross_compiling=yes to prevent executing any binary
+83b545ee add vc9+ makefile
+296f6914 fix output loop for small height
+cbfbb5c3 convert to plain-C
+f09f96ee Fix declaration after statement warning
+5981ee55 Fix UV plane ac/dc quantizer transposition
+c8d15efa convert to ANSI-C
+c3f41cb4 Initial commit
diff --git a/src/3rdparty/libwebp/NEWS b/src/3rdparty/libwebp/NEWS
index 30554bf..3bf4bd0 100644
--- a/src/3rdparty/libwebp/NEWS
+++ b/src/3rdparty/libwebp/NEWS
@@ -1,3 +1,31 @@
+- 1/26/2017: version 0.6.0
+  * lossless performance and compression improvements
+  * miscellaneous performance improvements (SSE2, NEON, MSA)
+  * webpmux gained a -duration option allowing for frame timing modification
+  * new img2webp utility allowing a sequence of images to be converted to
+    animated webp
+  * API changes:
+    - libwebp:
+      WebPPictureSharpARGBToYUVA
+      WebPPlaneDistortion
+    - libwebpmux / gif2webp:
+      WebPAnimEncoderOptions: kmax <= 0 now disables keyframes, kmax == 1
+                              forces all keyframes. See mux.h and the gif2webp
+                              manpage for details.
+
+- 12/13/2016: version 0.5.2
+  This is a binary compatible release.
+  This release covers CVE-2016-8888 and CVE-2016-9085.
+  * further security related hardening in the tools; fixes to
+    gif2webp/AnimEncoder (issues #310, #314, #316, #322), cwebp/libwebp (issue
+    #312)
+  * full libwebp (encoder & decoder) iOS framework; libwebpdecoder
+    WebP.framework renamed to WebPDecoder.framework (issue #307)
+  * CMake support for Android Studio (2.2)
+  * miscellaneous build related fixes (issue #306, #313)
+  * miscellaneous documentation improvements (issue #225)
+  * minor lossy encoder fixes and improvements
+
 - 6/14/2016: version 0.5.1
   This is a binary compatible release.
   * miscellaneous bug fixes (issues #280, #289)
diff --git a/src/3rdparty/libwebp/README b/src/3rdparty/libwebp/README
index 90f8f10..4c15c4a 100644
--- a/src/3rdparty/libwebp/README
+++ b/src/3rdparty/libwebp/README
@@ -4,7 +4,7 @@
           \__\__/\____/\_____/__/ ____  ___
                 / _/ /    \    \ /  _ \/ _/
                /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.5.1
+               \____/____/\_____/_____/____/v0.6.0
 
 Description:
 ============
@@ -220,8 +220,9 @@ assumed to be a PNG, JPEG, TIFF or WebP file.
 Options:
   -h / -help ............. short help
   -H / -longhelp ......... long help
-  -q <float> ............. quality factor (0:small..100:big)
-  -alpha_q <int> ......... transparency-compression quality (0..100)
+  -q <float> ............. quality factor (0:small..100:big), default=75
+  -alpha_q <int> ......... transparency-compression quality (0..100),
+                           default=100
   -preset <string> ....... preset setting, one of:
                             default, photo, picture,
                             drawing, icon, text
@@ -229,17 +230,18 @@ Options:
   -z <int> ............... activates lossless preset with given
                            level in [0:fast, ..., 9:slowest]
 
-  -m <int> ............... compression method (0=fast, 6=slowest)
-  -segments <int> ........ number of segments to use (1..4)
+  -m <int> ............... compression method (0=fast, 6=slowest), default=4
+  -segments <int> ........ number of segments to use (1..4), default=4
   -size <int> ............ target size (in bytes)
   -psnr <float> .......... target PSNR (in dB. typically: 42)
 
   -s <int> <int> ......... input size (width x height) for YUV
-  -sns <int> ............. spatial noise shaping (0:off, 100:max)
-  -f <int> ............... filter strength (0=off..100)
-  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
+  -sns <int> ............. spatial noise shaping (0:off, 100:max), default=50
+  -f <int> ............... filter strength (0=off..100), default=60
+  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp), default=0
   -strong ................ use strong filter instead of simple (default)
   -nostrong .............. use simple filter instead of strong
+  -sharp_yuv ............. use sharper (and slower) RGB->YUV conversion
   -partition_limit <int> . limit quality to fit the 512k limit on
                            the first partition (0=no degradation ... 100=full)
   -pass <int> ............ analysis pass number (1..10)
@@ -252,18 +254,18 @@ Options:
   -print_ssim ............ prints averaged SSIM distortion
   -print_lsim ............ prints local-similarity distortion
   -d <file.pgm> .......... dump the compressed output (PGM file)
-  -alpha_method <int> .... transparency-compression method (0..1)
+  -alpha_method <int> .... transparency-compression method (0..1), default=1
   -alpha_filter <string> . predictive filtering for alpha plane,
                            one of: none, fast (default) or best
-  -exact ................. preserve RGB values in transparent area
+  -exact ................. preserve RGB values in transparent area, default=off
   -blend_alpha <hex> ..... blend colors against background color
                            expressed as RGB values written in
                            hexadecimal, e.g. 0xc0e0d0 for red=0xc0
                            green=0xe0 and blue=0xd0
   -noalpha ............... discard any transparency information
-  -lossless .............. encode image losslessly
+  -lossless .............. encode image losslessly, default=off
   -near_lossless <int> ... use near-lossless image
-                           preprocessing (0..100=off)
+                           preprocessing (0..100=off), default=100
   -hint <string> ......... specify image characteristics hint,
                            one of: photo, picture or graph
 
@@ -383,6 +385,7 @@ Options are:
 Keyboard shortcuts:
   'c' ................ toggle use of color profile
   'i' ................ overlay file information
+  'd' ................ disable blending & disposal (debug)
   'q' / 'Q' / ESC .... quit
 
 Building:
@@ -411,6 +414,37 @@ $ make -f makefile.unix examples/vwebp
 > nmake /f Makefile.vc CFG=release-static \
     ../obj/x64/release-static/bin/vwebp.exe
 
+Animation creation tool:
+========================
+The utility 'img2webp' can turn a sequence of input images (PNG, JPEG, ...)
+into an animated WebP file. It offers fine control over duration, encoding
+modes, etc.
+
+Usage:
+
+  img2webp [file-level options] [image files...] [per-frame options...]
+
+File-level options (only used at the start of compression):
+ -min_size ............ minimize size
+ -loop <int> .......... loop count (default: 0, = infinite loop)
+ -kmax <int> .......... maximum number of frames between key-frames
+                        (0=only keyframes)
+ -kmin <int> .......... minimum number of frames between key-frames
+                        (0=disable key-frames altogether)
+ -mixed ............... use mixed lossy/lossless automatic mode
+ -v ................... verbose mode
+ -h ................... this help
+
+Per-frame options (only used for subsequent input images):
+ -d <int> ............. frame duration in ms (default: 100)
+ -lossless ............ use lossless mode (default)
+ -lossy ............... use lossy mode
+ -q <float> ........... quality
+ -m <int> ............. method to use
+
+example: img2webp -loop 2 in0.png -lossy in1.jpg
+                  -d 80 in2.tiff -o out.webp
+
 Animated GIF conversion:
 ========================
 Animated GIF files can be converted to WebP files with animation using the
diff --git a/src/3rdparty/libwebp/qt_attribution.json b/src/3rdparty/libwebp/qt_attribution.json
index 825cfea..09f91dd 100644
--- a/src/3rdparty/libwebp/qt_attribution.json
+++ b/src/3rdparty/libwebp/qt_attribution.json
@@ -6,7 +6,7 @@
 
     "Description": "WebP is a new image format that provides lossless and lossy compression for images on the web.",
     "Homepage": "https://developers.google.com/speed/webp/",
-    "Version": "0.5.1",
+    "Version": "0.6.0",
     "License": "BSD 3-clause \"New\" or \"Revised\" License",
     "LicenseId": "BSD-3-Clause",
     "LicenseFile": "COPYING",
diff --git a/src/3rdparty/libwebp/src/dec/alpha.c b/src/3rdparty/libwebp/src/dec/alpha_dec.c
index 028eb3d..83ffd4b 100644
--- a/src/3rdparty/libwebp/src/dec/alpha.c
+++ b/src/3rdparty/libwebp/src/dec/alpha_dec.c
@@ -12,11 +12,11 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./alphai.h"
-#include "./vp8i.h"
-#include "./vp8li.h"
+#include "./alphai_dec.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
 #include "../dsp/dsp.h"
-#include "../utils/quant_levels_dec.h"
+#include "../utils/quant_levels_dec_utils.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
@@ -67,7 +67,7 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
   }
 
   dec->method_ = (data[0] >> 0) & 0x03;
-  dec->filter_ = (data[0] >> 2) & 0x03;
+  dec->filter_ = (WEBP_FILTER_TYPE)((data[0] >> 2) & 0x03);
   dec->pre_processing_ = (data[0] >> 4) & 0x03;
   rsrv = (data[0] >> 6) & 0x03;
   if (dec->method_ < ALPHA_NO_COMPRESSION ||
diff --git a/src/3rdparty/libwebp/src/dec/alphai.h b/src/3rdparty/libwebp/src/dec/alphai_dec.h
index 69dd7c0..561e815 100644
--- a/src/3rdparty/libwebp/src/dec/alphai.h
+++ b/src/3rdparty/libwebp/src/dec/alphai_dec.h
@@ -14,8 +14,8 @@
 #ifndef WEBP_DEC_ALPHAI_H_
 #define WEBP_DEC_ALPHAI_H_
 
-#include "./webpi.h"
-#include "../utils/filters.h"
+#include "./webpi_dec.h"
+#include "../utils/filters_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/3rdparty/libwebp/src/dec/buffer.c b/src/3rdparty/libwebp/src/dec/buffer_dec.c
index 547e69b..c685fd5 100644
--- a/src/3rdparty/libwebp/src/dec/buffer.c
+++ b/src/3rdparty/libwebp/src/dec/buffer_dec.c
@@ -13,8 +13,8 @@
 
 #include <stdlib.h>
 
-#include "./vp8i.h"
-#include "./webpi.h"
+#include "./vp8i_dec.h"
+#include "./webpi_dec.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dec/common.h b/src/3rdparty/libwebp/src/dec/common_dec.h
index 6961e22..6961e22 100644
--- a/src/3rdparty/libwebp/src/dec/common.h
+++ b/src/3rdparty/libwebp/src/dec/common_dec.h
diff --git a/src/3rdparty/libwebp/src/dec/frame.c b/src/3rdparty/libwebp/src/dec/frame_dec.c
index 22d291d..f91e27f 100644
--- a/src/3rdparty/libwebp/src/dec/frame.c
+++ b/src/3rdparty/libwebp/src/dec/frame_dec.c
@@ -12,7 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./vp8i.h"
+#include "./vp8i_dec.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
@@ -723,7 +723,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
       return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                          "no memory during frame initialization.");
     }
-    // down-cast is ok, thanks to WebPSafeAlloc() above.
+    // down-cast is ok, thanks to WebPSafeMalloc() above.
     dec->mem_size_ = (size_t)needed;
   }
 
diff --git a/src/3rdparty/libwebp/src/dec/idec.c b/src/3rdparty/libwebp/src/dec/idec_dec.c
index 8de1319..78fb2e7 100644
--- a/src/3rdparty/libwebp/src/dec/idec.c
+++ b/src/3rdparty/libwebp/src/dec/idec_dec.c
@@ -15,9 +15,9 @@
 #include <string.h>
 #include <stdlib.h>
 
-#include "./alphai.h"
-#include "./webpi.h"
-#include "./vp8i.h"
+#include "./alphai_dec.h"
+#include "./webpi_dec.h"
+#include "./vp8i_dec.h"
 #include "../utils/utils.h"
 
 // In append mode, buffer allocations increase as multiples of this value.
diff --git a/src/3rdparty/libwebp/src/dec/io.c b/src/3rdparty/libwebp/src/dec/io_dec.c
index 8d5c43f..8bfab86 100644
--- a/src/3rdparty/libwebp/src/dec/io.c
+++ b/src/3rdparty/libwebp/src/dec/io_dec.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "../dec/vp8i.h"
-#include "./webpi.h"
+#include "../dec/vp8i_dec.h"
+#include "./webpi_dec.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 #include "../utils/utils.h"
@@ -256,7 +256,7 @@ static int Rescale(const uint8_t* src, int src_stride,
 static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
   const int mb_h = io->mb_h;
   const int uv_mb_h = (mb_h + 1) >> 1;
-  WebPRescaler* const scaler = &p->scaler_y;
+  WebPRescaler* const scaler = p->scaler_y;
   int num_lines_out = 0;
   if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) {
     // Before rescaling, we premultiply the luma directly into the io->y
@@ -267,29 +267,28 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
                  io->a, io->width, io->mb_w, mb_h, 0);
   }
   num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler);
-  Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
-  Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
+  Rescale(io->u, io->uv_stride, uv_mb_h, p->scaler_u);
+  Rescale(io->v, io->uv_stride, uv_mb_h, p->scaler_v);
   return num_lines_out;
 }
 
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
                                 int expected_num_lines_out) {
   const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
   if (io->a != NULL) {
-    uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
-    const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
-    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
-    (void)expected_num_lines_out;
+    uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
     assert(expected_num_lines_out == num_lines_out);
     if (num_lines_out > 0) {   // unmultiply the Y
-      WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
-                   p->scaler_a.dst_width, num_lines_out, 1);
+      WebPMultRows(dst_y, buf->y_stride, dst_a, buf->a_stride,
+                   p->scaler_a->dst_width, num_lines_out, 1);
     }
   } else if (buf->a != NULL) {
     // the user requested alpha, but there is none, set it to opaque.
     assert(p->last_y + expected_num_lines_out <= io->scaled_height);
-    FillAlphaPlane(buf->a + p->last_y * buf->a_stride,
-                   io->scaled_width, expected_num_lines_out, buf->a_stride);
+    FillAlphaPlane(dst_a, io->scaled_width, expected_num_lines_out,
+                   buf->a_stride);
   }
   return 0;
 }
@@ -305,31 +304,42 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int uv_in_height = (io->mb_h + 1) >> 1;
   const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
   const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
-  size_t tmp_size;
+  size_t tmp_size, rescaler_size;
   rescaler_t* work;
+  WebPRescaler* scalers;
+  const int num_rescalers = has_alpha ? 4 : 3;
 
   tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
   if (has_alpha) {
     tmp_size += work_size * sizeof(*work);
   }
-  p->memory = WebPSafeMalloc(1ULL, tmp_size);
+  rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+
+  p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
-  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+
+  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+  p->scaler_y = &scalers[0];
+  p->scaler_u = &scalers[1];
+  p->scaler_v = &scalers[2];
+  p->scaler_a = has_alpha ? &scalers[3] : NULL;
+
+  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
                    buf->y, out_width, out_height, buf->y_stride, 1,
                    work);
-  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
                    buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
                    work + work_size);
-  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
                    buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
                    work + work_size + uv_work_size);
   p->emit = EmitRescaledYUV;
 
   if (has_alpha) {
-    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
                      buf->a, out_width, out_height, buf->a_stride, 1,
                      work + work_size + 2 * uv_work_size);
     p->emit_alpha = EmitRescaledAlphaYUV;
@@ -349,15 +359,15 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
   int num_lines_out = 0;
   // For RGB rescaling, because of the YUV420, current scan position
   // U/V can be +1/-1 line from the Y one.  Hence the double test.
-  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
-         WebPRescalerHasPendingOutput(&p->scaler_u)) {
+  while (WebPRescalerHasPendingOutput(p->scaler_y) &&
+         WebPRescalerHasPendingOutput(p->scaler_u)) {
     assert(y_pos + num_lines_out < p->output->height);
-    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y);
-    WebPRescalerExportRow(&p->scaler_u);
-    WebPRescalerExportRow(&p->scaler_v);
-    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
-            dst, p->scaler_y.dst_width);
+    assert(p->scaler_u->y_accum == p->scaler_v->y_accum);
+    WebPRescalerExportRow(p->scaler_y);
+    WebPRescalerExportRow(p->scaler_u);
+    WebPRescalerExportRow(p->scaler_v);
+    convert(p->scaler_y->dst, p->scaler_u->dst, p->scaler_v->dst,
+            dst, p->scaler_y->dst_width);
     dst += buf->stride;
     ++num_lines_out;
   }
@@ -371,15 +381,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
   int num_lines_out = 0;
   while (j < mb_h) {
     const int y_lines_in =
-        WebPRescalerImport(&p->scaler_y, mb_h - j,
+        WebPRescalerImport(p->scaler_y, mb_h - j,
                            io->y + j * io->y_stride, io->y_stride);
     j += y_lines_in;
-    if (WebPRescaleNeededLines(&p->scaler_u, uv_mb_h - uv_j)) {
+    if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
       const int u_lines_in =
-          WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+          WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
                              io->u + uv_j * io->uv_stride, io->uv_stride);
       const int v_lines_in =
-          WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+          WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
                              io->v + uv_j * io->uv_stride, io->uv_stride);
       (void)v_lines_in;   // remove a gcc warning
       assert(u_lines_in == v_lines_in);
@@ -400,13 +410,13 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
   int num_lines_out = 0;
   const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
   uint32_t non_opaque = 0;
-  const int width = p->scaler_a.dst_width;
+  const int width = p->scaler_a->dst_width;
 
-  while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+  while (WebPRescalerHasPendingOutput(p->scaler_a) &&
          num_lines_out < max_lines_out) {
     assert(y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a);
-    non_opaque |= WebPDispatchAlpha(p->scaler_a.dst, 0, width, 1, dst, 0);
+    WebPRescalerExportRow(p->scaler_a);
+    non_opaque |= WebPDispatchAlpha(p->scaler_a->dst, 0, width, 1, dst, 0);
     dst += buf->stride;
     ++num_lines_out;
   }
@@ -428,18 +438,18 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
 #endif
   int num_lines_out = 0;
   const WEBP_CSP_MODE colorspace = p->output->colorspace;
-  const int width = p->scaler_a.dst_width;
+  const int width = p->scaler_a->dst_width;
   const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
   uint32_t alpha_mask = 0x0f;
 
-  while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+  while (WebPRescalerHasPendingOutput(p->scaler_a) &&
          num_lines_out < max_lines_out) {
     int i;
     assert(y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a);
+    WebPRescalerExportRow(p->scaler_a);
     for (i = 0; i < width; ++i) {
       // Fill in the alpha value (converted to 4 bits).
-      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      const uint32_t alpha_value = p->scaler_a->dst[i] >> 4;
       alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
       alpha_mask &= alpha_value;
     }
@@ -455,7 +465,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
 static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
                                 int expected_num_out_lines) {
   if (io->a != NULL) {
-    WebPRescaler* const scaler = &p->scaler_a;
+    WebPRescaler* const scaler = p->scaler_a;
     int lines_left = expected_num_out_lines;
     const int y_end = p->last_y + lines_left;
     while (lines_left > 0) {
@@ -477,7 +487,9 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
   rescaler_t* work;  // rescalers work area
   uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
-  size_t tmp_size1, tmp_size2, total_size;
+  size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+  WebPRescaler* scalers;
+  const int num_rescalers = has_alpha ? 4 : 3;
 
   tmp_size1 = 3 * work_size;
   tmp_size2 = 3 * out_width;
@@ -486,26 +498,35 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
     tmp_size2 += out_width;
   }
   total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
-  p->memory = WebPSafeMalloc(1ULL, total_size);
+  rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+
+  p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
   tmp = (uint8_t*)(work + tmp_size1);
-  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+
+  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+  p->scaler_y = &scalers[0];
+  p->scaler_u = &scalers[1];
+  p->scaler_v = &scalers[2];
+  p->scaler_a = has_alpha ? &scalers[3] : NULL;
+
+  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
                    tmp + 0 * out_width, out_width, out_height, 0, 1,
                    work + 0 * work_size);
-  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
                    tmp + 1 * out_width, out_width, out_height, 0, 1,
                    work + 1 * work_size);
-  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
                    tmp + 2 * out_width, out_width, out_height, 0, 1,
                    work + 2 * work_size);
   p->emit = EmitRescaledRGB;
   WebPInitYUV444Converters();
 
   if (has_alpha) {
-    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
                      tmp + 3 * out_width, out_width, out_height, 0, 1,
                      work + 3 * work_size);
     p->emit_alpha = EmitRescaledAlphaRGB;
diff --git a/src/3rdparty/libwebp/src/dec/quant.c b/src/3rdparty/libwebp/src/dec/quant_dec.c
index 5b648f9..14e3198 100644
--- a/src/3rdparty/libwebp/src/dec/quant.c
+++ b/src/3rdparty/libwebp/src/dec/quant_dec.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i.h"
+#include "./vp8i_dec.h"
 
 static WEBP_INLINE int clip(int v, int M) {
   return v < 0 ? 0 : v > M ? M : v;
diff --git a/src/3rdparty/libwebp/src/dec/tree.c b/src/3rdparty/libwebp/src/dec/tree_dec.c
index c2007ea..9e805f6 100644
--- a/src/3rdparty/libwebp/src/dec/tree.c
+++ b/src/3rdparty/libwebp/src/dec/tree_dec.c
@@ -11,10 +11,13 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i.h"
-#include "../utils/bit_reader_inl.h"
+#include "./vp8i_dec.h"
+#include "../utils/bit_reader_inl_utils.h"
 
+#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
 #define USE_GENERIC_TREE
+#endif
 
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
diff --git a/src/3rdparty/libwebp/src/dec/vp8.c b/src/3rdparty/libwebp/src/dec/vp8_dec.c
index 336680c..fad8d9c 100644
--- a/src/3rdparty/libwebp/src/dec/vp8.c
+++ b/src/3rdparty/libwebp/src/dec/vp8_dec.c
@@ -13,11 +13,11 @@
 
 #include <stdlib.h>
 
-#include "./alphai.h"
-#include "./vp8i.h"
-#include "./vp8li.h"
-#include "./webpi.h"
-#include "../utils/bit_reader_inl.h"
+#include "./alphai_dec.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "./webpi_dec.h"
+#include "../utils/bit_reader_inl_utils.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
@@ -27,6 +27,16 @@ int WebPGetDecoderVersion(void) {
 }
 
 //------------------------------------------------------------------------------
+// Signature and pointer-to-function for GetCoeffs() variants below.
+
+typedef int (*GetCoeffsFunc)(VP8BitReader* const br,
+                             const VP8BandProbas* const prob[],
+                             int ctx, const quant_t dq, int n, int16_t* out);
+static volatile GetCoeffsFunc GetCoeffs = NULL;
+
+static void InitGetCoeffs(void);
+
+//------------------------------------------------------------------------------
 // VP8Decoder
 
 static void SetOk(VP8Decoder* const dec) {
@@ -51,6 +61,7 @@ VP8Decoder* VP8New(void) {
     WebPGetWorkerInterface()->Init(&dec->worker_);
     dec->ready_ = 0;
     dec->num_parts_minus_one_ = 0;
+    InitGetCoeffs();
   }
   return dec;
 }
@@ -273,12 +284,14 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     frm_hdr->profile_ = (bits >> 1) & 7;
     frm_hdr->show_ = (bits >> 4) & 1;
     frm_hdr->partition_length_ = (bits >> 5);
-    if (frm_hdr->profile_ > 3)
+    if (frm_hdr->profile_ > 3) {
       return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                          "Incorrect keyframe parameters.");
-    if (!frm_hdr->show_)
+    }
+    if (!frm_hdr->show_) {
       return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                          "Frame not displayable.");
+    }
     buf += 3;
     buf_size -= 3;
   }
@@ -420,8 +433,9 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
 }
 
 // Returns the position of the last non-zero coeff plus one
-static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
-                     int ctx, const quant_t dq, int n, int16_t* out) {
+static int GetCoeffsFast(VP8BitReader* const br,
+                         const VP8BandProbas* const prob[],
+                         int ctx, const quant_t dq, int n, int16_t* out) {
   const uint8_t* p = prob[n]->probas_[ctx];
   for (; n < 16; ++n) {
     if (!VP8GetBit(br, p[0])) {
@@ -447,6 +461,46 @@ static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
   return 16;
 }
 
+// This version of GetCoeffs() uses VP8GetBitAlt() which is an alternate version
+// of VP8GetBit() targeting specific platforms.
+static int GetCoeffsAlt(VP8BitReader* const br,
+                        const VP8BandProbas* const prob[],
+                        int ctx, const quant_t dq, int n, int16_t* out) {
+  const uint8_t* p = prob[n]->probas_[ctx];
+  for (; n < 16; ++n) {
+    if (!VP8GetBitAlt(br, p[0])) {
+      return n;  // previous coeff was last non-zero coeff
+    }
+    while (!VP8GetBitAlt(br, p[1])) {       // sequence of zero coeffs
+      p = prob[++n]->probas_[0];
+      if (n == 16) return 16;
+    }
+    {        // non zero coeff
+      const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
+      int v;
+      if (!VP8GetBitAlt(br, p[2])) {
+        v = 1;
+        p = p_ctx[1];
+      } else {
+        v = GetLargeValue(br, p);
+        p = p_ctx[2];
+      }
+      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+    }
+  }
+  return 16;
+}
+
+WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) {
+  if (GetCoeffs == NULL) {
+    if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+      GetCoeffs = GetCoeffsAlt;
+    } else {
+      GetCoeffs = GetCoeffsFast;
+    }
+  }
+}
+
 static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
   nz_coeffs <<= 2;
   nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
diff --git a/src/3rdparty/libwebp/src/dec/decode_vp8.h b/src/3rdparty/libwebp/src/dec/vp8_dec.h
index b9337bb..b9337bb 100644
--- a/src/3rdparty/libwebp/src/dec/decode_vp8.h
+++ b/src/3rdparty/libwebp/src/dec/vp8_dec.h
diff --git a/src/3rdparty/libwebp/src/dec/vp8i.h b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
index 00da02b..555853e 100644
--- a/src/3rdparty/libwebp/src/dec/vp8i.h
+++ b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
@@ -15,11 +15,11 @@
 #define WEBP_DEC_VP8I_H_
 
 #include <string.h>     // for memcpy()
-#include "./common.h"
-#include "./vp8li.h"
-#include "../utils/bit_reader.h"
-#include "../utils/random.h"
-#include "../utils/thread.h"
+#include "./common_dec.h"
+#include "./vp8li_dec.h"
+#include "../utils/bit_reader_utils.h"
+#include "../utils/random_utils.h"
+#include "../utils/thread_utils.h"
 #include "../dsp/dsp.h"
 
 #ifdef __cplusplus
@@ -31,8 +31,8 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 5
-#define DEC_REV_VERSION 1
+#define DEC_MIN_VERSION 6
+#define DEC_REV_VERSION 0
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
diff --git a/src/3rdparty/libwebp/src/dec/vp8l.c b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
index cb2e317..ef359a9 100644
--- a/src/3rdparty/libwebp/src/dec/vp8l.c
+++ b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
@@ -14,13 +14,14 @@
 
 #include <stdlib.h>
 
-#include "./alphai.h"
-#include "./vp8li.h"
+#include "./alphai_dec.h"
+#include "./vp8li_dec.h"
 #include "../dsp/dsp.h"
 #include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../dsp/yuv.h"
-#include "../utils/endian_inl.h"
-#include "../utils/huffman.h"
+#include "../utils/endian_inl_utils.h"
+#include "../utils/huffman_utils.h"
 #include "../utils/utils.h"
 
 #define NUM_ARGB_CACHE_ROWS          16
@@ -547,11 +548,14 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
     uint8_t* const row_out = out + num_lines_out * out_stride;
     const int lines_left = mb_h - num_lines_in;
     const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+    int lines_imported;
     assert(needed_lines > 0 && needed_lines <= lines_left);
     WebPMultARGBRows(row_in, in_stride,
                      dec->rescaler->src_width, needed_lines, 0);
-    WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
-    num_lines_in += needed_lines;
+    lines_imported =
+        WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
+    assert(lines_imported == needed_lines);
+    num_lines_in += lines_imported;
     num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
   }
   return num_lines_out;
@@ -623,9 +627,12 @@ static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
   while (num_lines_in < mb_h) {
     const int lines_left = mb_h - num_lines_in;
     const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+    int lines_imported;
     WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0);
-    WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
-    num_lines_in += needed_lines;
+    lines_imported =
+        WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
+    assert(lines_imported == needed_lines);
+    num_lines_in += lines_imported;
     in += needed_lines * in_stride;
     y_pos += ExportYUVA(dec, y_pos);
   }
@@ -705,13 +712,15 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
   uint32_t* const rows_out = dec->argb_cache_;
 
   // Inverse transforms.
-  // TODO: most transforms only need to operate on the cropped region only.
-  memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
   while (n-- > 0) {
     VP8LTransform* const transform = &dec->transforms_[n];
     VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
     rows_in = rows_out;
   }
+  if (rows_in != rows_out) {
+    // No transform called, hence just copy.
+    memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
+  }
 }
 
 // Processes (transforms, scales & color-converts) the rows decoded after the
@@ -1210,8 +1219,9 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
       // Equivalent to AddPixelEq(), on a byte-basis.
       new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
     }
-    for (; i < 4 * final_num_colors; ++i)
+    for (; i < 4 * final_num_colors; ++i) {
       new_data[i] = 0;  // black tail.
+    }
     WebPSafeFree(transform->data_);
     transform->data_ = new_color_map;
   }
@@ -1482,9 +1492,8 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int last_row) {
     const int cache_pixs = width * num_rows_to_process;
     uint8_t* const dst = output + width * cur_row;
     const uint32_t* const src = dec->argb_cache_;
-    int i;
     ApplyInverseTransforms(dec, num_rows_to_process, in);
-    for (i = 0; i < cache_pixs; ++i) dst[i] = (src[i] >> 8) & 0xff;
+    WebPExtractGreen(src, dst, cache_pixs);
     AlphaApplyFilter(alph_dec,
                      cur_row, cur_row + num_rows_to_process, dst, width);
     num_rows -= num_rows_to_process;
@@ -1552,6 +1561,8 @@ int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
     return 1;  // done
   }
 
+  if (!alph_dec->use_8b_decode_) WebPInitAlphaProcessing();
+
   // Decode (with special row processing).
   return alph_dec->use_8b_decode_ ?
       DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
diff --git a/src/3rdparty/libwebp/src/dec/vp8li.h b/src/3rdparty/libwebp/src/dec/vp8li_dec.h
index 9313bdc..097a9d0 100644
--- a/src/3rdparty/libwebp/src/dec/vp8li.h
+++ b/src/3rdparty/libwebp/src/dec/vp8li_dec.h
@@ -16,10 +16,10 @@
 #define WEBP_DEC_VP8LI_H_
 
 #include <string.h>     // for memcpy()
-#include "./webpi.h"
-#include "../utils/bit_reader.h"
-#include "../utils/color_cache.h"
-#include "../utils/huffman.h"
+#include "./webpi_dec.h"
+#include "../utils/bit_reader_utils.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/huffman_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/3rdparty/libwebp/src/dec/webp.c b/src/3rdparty/libwebp/src/dec/webp_dec.c
index d0b912f..a8e9c2c 100644
--- a/src/3rdparty/libwebp/src/dec/webp.c
+++ b/src/3rdparty/libwebp/src/dec/webp_dec.c
@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp8i.h"
-#include "./vp8li.h"
-#include "./webpi.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "./webpi_dec.h"
 #include "../utils/utils.h"
 #include "../webp/mux_types.h"  // ALPHA_FLAG
 
@@ -39,8 +39,8 @@
 //   20..23  VP8X flags bit-map corresponding to the chunk-types present.
 //   24..26  Width of the Canvas Image.
 //   27..29  Height of the Canvas Image.
-// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
-// VP8L, XMP, EXIF  ...)
+// There can be extra chunks after the "VP8X" chunk (ICCP, ANMF, VP8, VP8L,
+// XMP, EXIF  ...)
 // All sizes are in little-endian order.
 // Note: chunk data size must be padded to multiple of 2 when written.
 
@@ -289,7 +289,6 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
   int found_riff = 0;
   int found_vp8x = 0;
   int animation_present = 0;
-  int fragments_present = 0;
   const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
 
   VP8StatusCode status;
@@ -318,7 +317,6 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
       return status;  // Wrong VP8X / insufficient data.
     }
     animation_present = !!(flags & ANIMATION_FLAG);
-    fragments_present = !!(flags & FRAGMENTS_FLAG);
     if (!found_riff && found_vp8x) {
       // Note: This restriction may be removed in the future, if it becomes
       // necessary to send VP8X chunk to the decoder.
@@ -330,8 +328,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
 
     image_width = canvas_width;
     image_height = canvas_height;
-    if (found_vp8x && (animation_present || fragments_present) &&
-        headers == NULL) {
+    if (found_vp8x && animation_present && headers == NULL) {
       status = VP8_STATUS_OK;
       goto ReturnWidthHeight;  // Just return features from VP8X header.
     }
@@ -362,7 +359,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
     return VP8_STATUS_BITSTREAM_ERROR;
   }
 
-  if (format != NULL && !(animation_present || fragments_present)) {
+  if (format != NULL && !animation_present) {
     *format = hdrs.is_lossless ? 2 : 1;
   }
 
diff --git a/src/3rdparty/libwebp/src/dec/webpi.h b/src/3rdparty/libwebp/src/dec/webpi_dec.h
index 991b194..696abc1 100644
--- a/src/3rdparty/libwebp/src/dec/webpi.h
+++ b/src/3rdparty/libwebp/src/dec/webpi_dec.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-#include "../utils/rescaler.h"
-#include "./decode_vp8.h"
+#include "../utils/rescaler_utils.h"
+#include "./vp8_dec.h"
 
 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@@ -38,27 +38,18 @@ struct WebPDecParams {
 
   int last_y;                 // coordinate of the line that was last output
   const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
-  // rescalers
-  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+
+  WebPRescaler* scaler_y, *scaler_u, *scaler_v, *scaler_a;  // rescalers
   void* memory;                  // overall scratch memory for the output work.
 
   OutputFunc emit;               // output RGB or YUV samples
   OutputAlphaFunc emit_alpha;    // output alpha channel
   OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
-
-  WebPDecBuffer* final_output;   // In case the user supplied a slow-memory
-                                 // output, we decode image in temporary buffer
-                                 // (this::output) and copy it here.
-  WebPDecBuffer tmp_buffer;      // this::output will point to this one in case
-                                 // of slow memory.
 };
 
 // Should be called first, before any use of the WebPDecParams object.
 void WebPResetDecParams(WebPDecParams* const params);
 
-// Delete all memory (after an error occurred, for instance)
-void WebPFreeDecParams(WebPDecParams* const params);
-
 //------------------------------------------------------------------------------
 // Header parsing helpers
 
diff --git a/src/3rdparty/libwebp/src/demux/anim_decode.c b/src/3rdparty/libwebp/src/demux/anim_decode.c
index 1989eb4..f1cf176 100644
--- a/src/3rdparty/libwebp/src/demux/anim_decode.c
+++ b/src/3rdparty/libwebp/src/demux/anim_decode.c
@@ -112,18 +112,15 @@ WebPAnimDecoder* WebPAnimDecoderNewInternal(
   dec->info_.bgcolor = WebPDemuxGetI(dec->demux_, WEBP_FF_BACKGROUND_COLOR);
   dec->info_.frame_count = WebPDemuxGetI(dec->demux_, WEBP_FF_FRAME_COUNT);
 
-  {
-    const int canvas_bytes =
-        dec->info_.canvas_width * NUM_CHANNELS * dec->info_.canvas_height;
-    // Note: calloc() because we fill frame with zeroes as well.
-    dec->curr_frame_ = WebPSafeCalloc(1ULL, canvas_bytes);
-    if (dec->curr_frame_ == NULL) goto Error;
-    dec->prev_frame_disposed_ = WebPSafeCalloc(1ULL, canvas_bytes);
-    if (dec->prev_frame_disposed_ == NULL) goto Error;
-  }
+  // Note: calloc() because we fill frame with zeroes as well.
+  dec->curr_frame_ = (uint8_t*)WebPSafeCalloc(
+      dec->info_.canvas_width * NUM_CHANNELS, dec->info_.canvas_height);
+  if (dec->curr_frame_ == NULL) goto Error;
+  dec->prev_frame_disposed_ = (uint8_t*)WebPSafeCalloc(
+      dec->info_.canvas_width * NUM_CHANNELS, dec->info_.canvas_height);
+  if (dec->prev_frame_disposed_ == NULL) goto Error;
 
   WebPAnimDecoderReset(dec);
-
   return dec;
 
  Error:
@@ -144,9 +141,13 @@ static int IsFullFrame(int width, int height, int canvas_width,
 }
 
 // Clear the canvas to transparent.
-static void ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
-                           uint32_t canvas_height) {
-  memset(buf, 0, canvas_width * NUM_CHANNELS * canvas_height);
+static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
+                          uint32_t canvas_height) {
+  const uint64_t size =
+      (uint64_t)canvas_width * canvas_height * NUM_CHANNELS * sizeof(*buf);
+  if (size != (size_t)size) return 0;
+  memset(buf, 0, (size_t)size);
+  return 1;
 }
 
 // Clear given frame rectangle to transparent.
@@ -162,10 +163,13 @@ static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
 }
 
 // Copy width * height pixels from 'src' to 'dst'.
-static void CopyCanvas(const uint8_t* src, uint8_t* dst,
-                       uint32_t width, uint32_t height) {
+static int CopyCanvas(const uint8_t* src, uint8_t* dst,
+                      uint32_t width, uint32_t height) {
+  const uint64_t size = (uint64_t)width * height * NUM_CHANNELS;
+  if (size != (size_t)size) return 0;
   assert(src != NULL && dst != NULL);
-  memcpy(dst, src, width * NUM_CHANNELS * height);
+  memcpy(dst, src, (size_t)size);
+  return 1;
 }
 
 // Returns true if the current frame is a key-frame.
@@ -328,9 +332,14 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
   is_key_frame = IsKeyFrame(&iter, &dec->prev_iter_,
                             dec->prev_frame_was_keyframe_, width, height);
   if (is_key_frame) {
-    ZeroFillCanvas(dec->curr_frame_, width, height);
+    if (!ZeroFillCanvas(dec->curr_frame_, width, height)) {
+      goto Error;
+    }
   } else {
-    CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_, width, height);
+    if (!CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_,
+                    width, height)) {
+      goto Error;
+    }
   }
 
   // Decode.
@@ -393,6 +402,7 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
 
   // Update info of the previous frame and dispose it for the next iteration.
   dec->prev_frame_timestamp_ = timestamp;
+  WebPDemuxReleaseIterator(&dec->prev_iter_);
   dec->prev_iter_ = iter;
   dec->prev_frame_was_keyframe_ = is_key_frame;
   CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
@@ -421,6 +431,7 @@ int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec) {
 void WebPAnimDecoderReset(WebPAnimDecoder* dec) {
   if (dec != NULL) {
     dec->prev_frame_timestamp_ = 0;
+    WebPDemuxReleaseIterator(&dec->prev_iter_);
     memset(&dec->prev_iter_, 0, sizeof(dec->prev_iter_));
     dec->prev_frame_was_keyframe_ = 0;
     dec->next_frame_ = 1;
@@ -434,6 +445,7 @@ const WebPDemuxer* WebPAnimDecoderGetDemuxer(const WebPAnimDecoder* dec) {
 
 void WebPAnimDecoderDelete(WebPAnimDecoder* dec) {
   if (dec != NULL) {
+    WebPDemuxReleaseIterator(&dec->prev_iter_);
     WebPDemuxDelete(dec->demux_);
     WebPSafeFree(dec->curr_frame_);
     WebPSafeFree(dec->prev_frame_disposed_);
diff --git a/src/3rdparty/libwebp/src/demux/demux.c b/src/3rdparty/libwebp/src/demux/demux.c
index 0d2989f..100eab8 100644
--- a/src/3rdparty/libwebp/src/demux/demux.c
+++ b/src/3rdparty/libwebp/src/demux/demux.c
@@ -25,7 +25,7 @@
 
 #define DMUX_MAJ_VERSION 0
 #define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 0
+#define DMUX_REV_VERSION 2
 
 typedef struct {
   size_t start_;        // start location of the data
@@ -590,7 +590,6 @@ static int CheckFrameBounds(const Frame* const frame, int exact,
 
 static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
   const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
   const Frame* f = dmux->frames_;
 
   if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
@@ -598,7 +597,7 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
   if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
   if (dmux->loop_count_ < 0) return 0;
   if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
-  if (is_fragmented) return 0;
+  if (dmux->feature_flags_ & ~ALL_VALID_FLAGS) return 0;  // invalid bitstream
 
   while (f != NULL) {
     const int cur_frame_set = f->frame_num_;
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 1716cac..4b60e09 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -284,9 +284,9 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
 #endif
 }
 
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+                           int width, int height,
+                           uint8_t* dst, int dst_stride) {
   uint32_t alpha_mask = 0xff;
   int i, j;
 
@@ -303,9 +303,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xff);
 }
 
-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
-                                 int width, int height,
-                                 uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
+                                   int width, int height,
+                                   uint32_t* dst, int dst_stride) {
   int i, j;
   for (j = 0; j < height; ++j) {
     for (i = 0; i < width; ++i) {
@@ -316,9 +316,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+                          int width, int height,
+                          uint8_t* alpha, int alpha_stride) {
   uint8_t alpha_mask = 0xff;
   int i, j;
 
@@ -334,11 +334,17 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
   return (alpha_mask == 0xff);
 }
 
+static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+  int i;
+  for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
+}
+
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
 int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
 
 //------------------------------------------------------------------------------
 // Init function
@@ -346,6 +352,7 @@ int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 extern void WebPInitAlphaProcessingMIPSdspR2(void);
 extern void WebPInitAlphaProcessingSSE2(void);
 extern void WebPInitAlphaProcessingSSE41(void);
+extern void WebPInitAlphaProcessingNEON(void);
 
 static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
     (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
@@ -357,9 +364,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
   WebPMultRow = WebPMultRowC;
   WebPApplyAlphaMultiply = ApplyAlphaMultiply;
   WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
-  WebPExtractAlpha = ExtractAlpha;
+
+  WebPDispatchAlpha = DispatchAlpha_C;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
+  WebPExtractAlpha = ExtractAlpha_C;
+  WebPExtractGreen = ExtractGreen_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -373,6 +382,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
     }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitAlphaProcessingNEON();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitAlphaProcessingMIPSdspR2();
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
new file mode 100644
index 0000000..606a401
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -0,0 +1,191 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel, NEON version.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+
+#define MULTIPLIER(a) ((a) * 0x8081)   // (v * 0x8081) >> 23 is v / 255
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)   // m is MULTIPLIER(alpha)
+// Scales V.val[1], V.val[2], V.val[OTHER] by V.val[ALPHA] / 255 (needs kOne).
+#define MULTIPLY_BY_ALPHA(V, ALPHA, OTHER) do {                        \
+  const uint8x8_t alpha = (V).val[(ALPHA)];                            \
+  const uint16x8_t r1 = vmull_u8((V).val[1], alpha);                   \
+  const uint16x8_t g1 = vmull_u8((V).val[2], alpha);                   \
+  const uint16x8_t b1 = vmull_u8((V).val[(OTHER)], alpha);             \
+  /* we use: v / 255 = (v + 1 + (v >> 8)) >> 8 */                      \
+  const uint16x8_t r2 = vsraq_n_u16(r1, r1, 8);                        \
+  const uint16x8_t g2 = vsraq_n_u16(g1, g1, 8);                        \
+  const uint16x8_t b2 = vsraq_n_u16(b1, b1, 8);                        \
+  const uint16x8_t r3 = vaddq_u16(r2, kOne);                           \
+  const uint16x8_t g3 = vaddq_u16(g2, kOne);                           \
+  const uint16x8_t b3 = vaddq_u16(b2, kOne);                           \
+  (V).val[1] = vshrn_n_u16(r3, 8);                                     \
+  (V).val[2] = vshrn_n_u16(g3, 8);                                     \
+  (V).val[(OTHER)] = vshrn_n_u16(b3, 8);                               \
+} while (0)
+// Premultiplies, in place, the color bytes of 'w' x 'h' 4-byte pixels by alpha.
+static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
+                                    int w, int h, int stride) {
+  const uint16x8_t kOne = vdupq_n_u16(1u);   // referenced by MULTIPLY_BY_ALPHA
+  while (h-- > 0) {   // one row per iteration; 'stride' is in bytes
+    uint32_t* const rgbx = (uint32_t*)rgba;
+    int i = 0;
+    if (alpha_first) {
+      for (; i + 8 <= w; i += 8) {
+        // load aaaa...|rrrr...|gggg...|bbbb... (8 pixels, de-interleaved)
+        uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i));
+        MULTIPLY_BY_ALPHA(RGBX, 0, 3);   // alpha is val[0]: scale val[1..3]
+        vst4_u8((uint8_t*)(rgbx + i), RGBX);
+      }
+    } else {
+      for (; i + 8 <= w; i += 8) {
+        uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i));
+        MULTIPLY_BY_ALPHA(RGBX, 3, 0);   // alpha is val[3]: scale val[0..2]
+        vst4_u8((uint8_t*)(rgbx + i), RGBX);
+      }
+    }
+    // Finish with left-overs (fewer than 8 pixels), one pixel at a time.
+    for (; i < w; ++i) {
+      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {   // pixels with alpha 0xff are left untouched
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLY_BY_ALPHA
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+//------------------------------------------------------------------------------
+
+// Stores 'alpha' into dst[4 * i]; returns true if some alpha is not 0xff.
+static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
+                              int width, int height,
+                              uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xffu;   // AND of the alphas seen by the scalar loop
+  uint8x8_t mask8 = vdup_n_u8(0xff);   // AND of the alphas seen by NEON loop
+  uint32_t tmp[2];
+  int i, j;
+  for (j = 0; j < height; ++j) {
+    // Alpha may be first or last in dst[] (rgbA/Argb mode), so the 32-byte
+    // store must not reach past the last pixel: test 'width - 1', not 'width'.
+    for (i = 0; i + 8 <= width - 1; i += 8) {
+      uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(dst + 4 * i));
+      const uint8x8_t alphas = vld1_u8(alpha + i);
+      rgbX.val[0] = alphas;
+      vst4_u8((uint8_t*)(dst + 4 * i), rgbX);
+      mask8 = vand_u8(mask8, alphas);
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = alpha[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+  vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101u;   // replicate the byte-wide mask into four bytes
+  alpha_mask &= tmp[0] & tmp[1];
+  return (alpha_mask != 0xffffffffu);
+}
+
+// Writes each alpha[] byte as 'alpha << 8' into dst[] (other bytes zero).
+static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
+                                      int width, int height,
+                                      uint32_t* dst, int dst_stride) {
+  const uint8x8_t kZero = vdup_n_u8(0);
+  uint8x8x4_t pixels;
+  int x, y;
+  pixels.val[0] = kZero;   // only val[1] changes inside the loops
+  pixels.val[2] = kZero;
+  pixels.val[3] = kZero;
+  for (y = 0; y < height; ++y, alpha += alpha_stride, dst += dst_stride) {
+    for (x = 0; x + 8 <= width; x += 8) {
+      pixels.val[1] = vld1_u8(alpha + x);
+      vst4_u8((uint8_t*)(dst + x), pixels);
+    }
+    for (; x < width; ++x) dst[x] = alpha[x] << 8;
+  }
+}
+
+// Copies argb[4 * i] into 'alpha'; returns true if every alpha is 0xff.
+static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+                             int width, int height,
+                             uint8_t* alpha, int alpha_stride) {
+  uint32_t alpha_mask = 0xffu;   // AND of the alphas seen by the scalar loop
+  uint8x8_t mask8 = vdup_n_u8(0xff);   // AND of the alphas seen by NEON loop
+  uint32_t tmp[2];
+  int i, j;
+  for (j = 0; j < height; ++j) {
+    // Alpha may be first or last in argb[] (rgbA/Argb mode), so the 32-byte
+    // load must not reach past the last pixel: test 'width - 1', not 'width'.
+    for (i = 0; i + 8 <= width - 1; i += 8) {
+      const uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(argb + 4 * i));
+      const uint8x8_t alphas = rgbX.val[0];
+      vst1_u8((uint8_t*)(alpha + i), alphas);
+      mask8 = vand_u8(mask8, alphas);
+    }
+    for (; i < width; ++i) {
+      alpha[i] = argb[4 * i];
+      alpha_mask &= alpha[i];
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101u;   // replicate the byte-wide mask into four bytes
+  alpha_mask &= tmp[0] & tmp[1];
+  return (alpha_mask == 0xffffffffu);
+}
+
+// Copies bits 8..15 of each argb[] entry into alpha[], 'size' entries in all.
+static void ExtractGreen_NEON(const uint32_t* argb,
+                              uint8_t* alpha, int size) {
+  int n = 0;
+  for (; n + 16 <= size; n += 16) {   // 16 pixels per iteration
+    const uint8x16x4_t planes = vld4q_u8((const uint8_t*)&argb[n]);
+    vst1q_u8(&alpha[n], planes.val[1]);
+  }
+  for (; n < size; ++n) alpha[n] = (uint8_t)(argb[n] >> 8);
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitAlphaProcessingNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingNEON(void) {
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_NEON;
+  WebPDispatchAlpha = DispatchAlpha_NEON;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_NEON;
+  WebPExtractAlpha = ExtractAlpha_NEON;
+  WebPExtractGreen = ExtractGreen_NEON;   // no other pointer is set here
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 5acb481..83dc559 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -150,46 +150,46 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
 
 // We can't use a 'const int' for the SHUFFLE value, because it has to be an
-// immediate in the _mm_shufflexx_epi16() instruction. We really a macro here.
-#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do {             \
-  const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX));     \
-  const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);         \
-  const __m128i alpha0 = _mm_and_si128(argb1, MASK);            \
-  const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE);  \
-  const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE);  \
-  /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */                       \
-  const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT);         \
-  const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT);         \
-  const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);         \
-  const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);         \
-  const __m128i argb4 = _mm_adds_epu16(argb2, argb3);           \
-  const __m128i argb5 = _mm_srli_epi16(argb4, 7);               \
-  const __m128i argb6 = _mm_or_si128(argb5, alpha0);            \
-  const __m128i argb7 = _mm_packus_epi16(argb6, zero);          \
-  _mm_storel_epi64((__m128i*)&(RGBX), argb7);                   \
+// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
+// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
+// value.
+#define APPLY_ALPHA(RGBX, SHUFFLE) do {                              \
+  const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX));    \
+  const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero);           \
+  const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero);           \
+  const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask);           \
+  const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask);           \
+  const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
+  const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
+  const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
+  const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
+  /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */                          \
+  const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo);        \
+  const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi);        \
+  const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult);               \
+  const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult);               \
+  const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7);                    \
+  const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7);                    \
+  const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi);                 \
+  _mm_storeu_si128((__m128i*)&(RGBX), A3);                           \
 } while (0)
 
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
-                               int w, int h, int stride) {
+static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
+                                    int w, int h, int stride) {
   const __m128i zero = _mm_setzero_si128();
-  const int kSpan = 2;
-  const int w2 = w & ~(kSpan - 1);
+  const __m128i kMult = _mm_set1_epi16(0x8081u);
+  const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
+  const int kSpan = 4;
   while (h-- > 0) {
     uint32_t* const rgbx = (uint32_t*)rgba;
     int i;
     if (!alpha_first) {
-      const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
-      const __m128i kMult =
-          _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
-      for (i = 0; i < w2; i += kSpan) {
-        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
+      for (i = 0; i + kSpan <= w; i += kSpan) {
+        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
       }
     } else {
-      const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
-      const __m128i kMult =
-          _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
-      for (i = 0; i < w2; i += kSpan) {
-        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
+      for (i = 0; i + kSpan <= w; i += kSpan) {
+        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
       }
     }
     // Finish with left-overs.
@@ -213,64 +213,51 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows
 
-// We use: kINV255 = (1 << 24) / 255 = 0x010101
-// So: a * kINV255 = (a << 16) | [(a << 8) | a]
-// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
-// and _mm_mullo_epu16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
-
-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
   int x = 0;
   if (!inverse) {
     const int kSpan = 2;
     const __m128i zero = _mm_setzero_si128();
-    const __m128i kRound =
-        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
-    const __m128i kMult =
-        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
-    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
-    const int w2 = width & ~(kSpan - 1);
-    for (x = 0; x < w2; x += kSpan) {
-      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
-      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
-      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
-      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
-      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
-      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
-      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
-      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
-      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
-      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
-      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
-      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
-      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
-      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
+    const __m128i k128 = _mm_set1_epi16(128);
+    const __m128i kMult = _mm_set1_epi16(0x0101);
+    const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
+    for (x = 0; x + kSpan <= width; x += kSpan) {
+      // To compute 'result = (int)(a * x / 255. + .5)', we use:
+      //   tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
+      const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
+      const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
+      const __m128i A2 = _mm_or_si128(A1, kMask);
+      const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
+      const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
+      // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
+      const __m128i A5 = _mm_mullo_epi16(A4, A1);
+      const __m128i A6 = _mm_add_epi16(A5, k128);
+      const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
+      const __m128i A10 = _mm_packus_epi16(A7, zero);
+      _mm_storel_epi64((__m128i*)&ptr[x], A10);
     }
   }
   width -= x;
   if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
 }
 
-static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
-                    int width, int inverse) {
+static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+                         int width, int inverse) {
   int x = 0;
   if (!inverse) {
-    const int kSpan = 8;
     const __m128i zero = _mm_setzero_si128();
-    const __m128i kRound = _mm_set1_epi16(1 << 7);
-    const int w2 = width & ~(kSpan - 1);
-    for (x = 0; x < w2; x += kSpan) {
+    const __m128i k128 = _mm_set1_epi16(128);
+    const __m128i kMult = _mm_set1_epi16(0x0101);
+    for (x = 0; x + 8 <= width; x += 8) {
       const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
       const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
-      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
-      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
-      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
-      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
-      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
-      const __m128i v4 = _mm_adds_epu16(v2, v3);
-      const __m128i v5 = _mm_adds_epu16(v4, kRound);
-      const __m128i v6 = _mm_srli_epi16(v5, 8);
-      const __m128i v7 = _mm_packus_epi16(v6, zero);
-      _mm_storel_epi64((__m128i*)&ptr[x], v7);
+      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
+      const __m128i v2 = _mm_mullo_epi16(v1, a1);
+      const __m128i v3 = _mm_add_epi16(v2, k128);
+      const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
+      const __m128i v5 = _mm_packus_epi16(v4, zero);
+      _mm_storel_epi64((__m128i*)&ptr[x], v5);
     }
   }
   width -= x;
@@ -283,9 +270,9 @@ static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
 extern void WebPInitAlphaProcessingSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
-  WebPMultARGBRow = MultARGBRow;
-  WebPMultRow = MultRow;
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPMultARGBRow = MultARGBRow_SSE2;
+  WebPMultRow = MultRow_SSE2;
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
   WebPDispatchAlpha = DispatchAlpha;
   WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
   WebPExtractAlpha = ExtractAlpha;
diff --git a/src/3rdparty/libwebp/src/dsp/common_sse2.h b/src/3rdparty/libwebp/src/dsp/common_sse2.h
index 7cea13f..995d7cf 100644
--- a/src/3rdparty/libwebp/src/dsp/common_sse2.h
+++ b/src/3rdparty/libwebp/src/dsp/common_sse2.h
@@ -100,6 +100,91 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
   // a03 a13 a23 a33   b03 b13 b23 b33
 }
 
+//------------------------------------------------------------------------------
+// Channel mixing.
+
+// Function used several times in VP8PlanarTo24b.
+// It samples the in buffer as follows: one every two unsigned char is stored
+// at the beginning of the buffer, while the other half is stored at the end.
+#define VP8PlanarTo24bHelper(IN, OUT)                            \
+  do {                                                           \
+    const __m128i v_mask = _mm_set1_epi16(0x00ff);               \
+    /* Keep the low byte of every 16b lane (even bytes). */      \
+    (OUT##0) = _mm_packus_epi16(_mm_and_si128((IN##0), v_mask),  \
+                                _mm_and_si128((IN##1), v_mask)); \
+    (OUT##1) = _mm_packus_epi16(_mm_and_si128((IN##2), v_mask),  \
+                                _mm_and_si128((IN##3), v_mask)); \
+    (OUT##2) = _mm_packus_epi16(_mm_and_si128((IN##4), v_mask),  \
+                                _mm_and_si128((IN##5), v_mask)); \
+    /* Keep the high byte of every 16b lane (odd bytes). */      \
+    (OUT##3) = _mm_packus_epi16(_mm_srli_epi16((IN##0), 8),      \
+                                _mm_srli_epi16((IN##1), 8));     \
+    (OUT##4) = _mm_packus_epi16(_mm_srli_epi16((IN##2), 8),      \
+                                _mm_srli_epi16((IN##3), 8));     \
+    (OUT##5) = _mm_packus_epi16(_mm_srli_epi16((IN##4), 8),      \
+                                _mm_srli_epi16((IN##5), 8));     \
+  } while (0)
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
+                                       __m128i* const in2, __m128i* const in3,
+                                       __m128i* const in4, __m128i* const in5) {
+  // The input is 6 registers of sixteen 8b but for the sake of explanation,
+  // let's take 6 registers of four 8b values.
+  // To pack, we will keep taking one every two 8b integer and move it
+  // around as follows:
+  // Input:
+  //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+  // Split the 6 registers in two sets of 3 registers: the first set as the even
+  // 8b bytes, the second the odd ones:
+  //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+  // Repeat the same permutations twice more:
+  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  VP8PlanarTo24bHelper(*in, tmp);   // '##' pastes: *in0..*in5 -> tmp0..tmp5
+  VP8PlanarTo24bHelper(tmp, *in);
+  VP8PlanarTo24bHelper(*in, tmp);
+  // We need to do it two more times than the example as we have sixteen bytes.
+  {
+    __m128i out0, out1, out2, out3, out4, out5;
+    VP8PlanarTo24bHelper(tmp, out);
+    VP8PlanarTo24bHelper(out, *in);   // result is written to *in0..*in5
+  }
+}
+
+#undef VP8PlanarTo24bHelper
+
+// Convert four packed four-channel buffers like argbargbargbargb... into the
+// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
+static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
+                                        __m128i* const in1,
+                                        __m128i* const in2,
+                                        __m128i* const in3) {
+  // Column-wise transpose.
+  const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
+  const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
+  const __m128i A2 = _mm_unpacklo_epi8(*in2, *in3);
+  const __m128i A3 = _mm_unpackhi_epi8(*in2, *in3);
+  const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+  const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
+  const __m128i B2 = _mm_unpacklo_epi8(A2, A3);
+  const __m128i B3 = _mm_unpackhi_epi8(A2, A3);
+  // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
+  // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
+  const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
+  const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
+  const __m128i C2 = _mm_unpacklo_epi8(B2, B3);   // C2/C3: same, from in2/in3
+  const __m128i C3 = _mm_unpackhi_epi8(B2, B3);
+  // Gather the channels.
+  *in0 = _mm_unpackhi_epi64(C1, C3);   // upper halves of C1/C3: aaaa...
+  *in1 = _mm_unpacklo_epi64(C1, C3);   // lower halves of C1/C3: rrrr...
+  *in2 = _mm_unpackhi_epi64(C0, C2);   // upper halves of C0/C2: gggg...
+  *in3 = _mm_unpacklo_epi64(C0, C2);   // lower halves of C0/C2: bbbb...
+}
+
 #endif  // WEBP_USE_SSE2
 
 #ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
index fe72d26..58ddea7 100644
--- a/src/3rdparty/libwebp/src/dsp/cost.c
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -10,7 +10,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include "./dsp.h"
-#include "../enc/cost.h"
+#include "../enc/cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips32.c b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
index d1e240e..3102da8 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
@@ -13,7 +13,7 @@
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "../enc/cost.h"
+#include "../enc/cost_enc.h"
 
 static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   int temp0, temp1;
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
index ce64067..6ec8aeb 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
@@ -13,7 +13,7 @@
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "../enc/cost.h"
+#include "../enc/cost_enc.h"
 
 static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   int temp0, temp1;
diff --git a/src/3rdparty/libwebp/src/dsp/cost_sse2.c b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
index 0cb1c1f..421d51f 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
@@ -16,8 +16,8 @@
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
-#include "../enc/cost.h"
-#include "../enc/vp8enci.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index cbb08db..b5583b6 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -95,26 +95,62 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #endif
 
 #if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+
+// helper function for run-time detection of slow SSSE3 platforms
+static int CheckSlowModel(int info) {
+  // Table listing display models with longer latencies for the bsr instruction
+  // (ie 2 cycles vs 10/16 cycles) and some SSSE3 instructions like pshufb.
+  // Refer to Intel 64 and IA-32 Architectures Optimization Reference Manual.
+  static const uint8_t kSlowModels[] = {
+    0x37, 0x4a, 0x4d,  // Silvermont Microarchitecture
+    0x1c, 0x26, 0x27   // Atom Microarchitecture
+  };
+  const uint32_t model = ((info & 0xf0000) >> 12) | ((info >> 4) & 0xf);
+  const uint32_t family = (info >> 8) & 0xf;   // bits 8..11 of 'info'
+  if (family == 0x06) {   // kSlowModels[] is only matched for family 0x06
+    size_t i;
+    for (i = 0; i < sizeof(kSlowModels) / sizeof(kSlowModels[0]); ++i) {
+      if (model == kSlowModels[i]) return 1;
+    }
+  }
+  return 0;   // not a listed model
+}
+
 static int x86CPUInfo(CPUFeature feature) {
   int max_cpuid_value;
   int cpu_info[4];
+  int is_intel = 0;
 
   // get the highest feature value cpuid supports
   GetCPUInfo(cpu_info, 0);
   max_cpuid_value = cpu_info[0];
   if (max_cpuid_value < 1) {
     return 0;
+  } else {
+    const int VENDOR_ID_INTEL_EBX = 0x756e6547;  // uneG
+    const int VENDOR_ID_INTEL_EDX = 0x49656e69;  // Ieni
+    const int VENDOR_ID_INTEL_ECX = 0x6c65746e;  // letn
+    is_intel = (cpu_info[1] == VENDOR_ID_INTEL_EBX &&
+                cpu_info[2] == VENDOR_ID_INTEL_ECX &&
+                cpu_info[3] == VENDOR_ID_INTEL_EDX);    // genuine Intel?
   }
 
   GetCPUInfo(cpu_info, 1);
   if (feature == kSSE2) {
-    return 0 != (cpu_info[3] & 0x04000000);
+    return !!(cpu_info[3] & (1 << 26));
   }
   if (feature == kSSE3) {
-    return 0 != (cpu_info[2] & 0x00000001);
+    return !!(cpu_info[2] & (1 << 0));
+  }
+  if (feature == kSlowSSSE3) {
+    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3 is ECX bit 9
+      return CheckSlowModel(cpu_info[0]);
+    }
+    return 0;
   }
+
   if (feature == kSSE4_1) {
-    return 0 != (cpu_info[2] & 0x00080000);
+    return !!(cpu_info[2] & (1 << 19));
   }
   if (feature == kAVX) {
     // bits 27 (OSXSAVE) & 28 (256-bit AVX)
@@ -126,7 +162,7 @@ static int x86CPUInfo(CPUFeature feature) {
   if (feature == kAVX2) {
     if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
       GetCPUInfo(cpu_info, 7);
-      return ((cpu_info[1] & 0x00000020) == 0x00000020);
+      return !!(cpu_info[1] & (1 << 5));
     }
   }
   return 0;
@@ -184,4 +220,3 @@ VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
 #else
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif
-
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index e92d693..007e985 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -12,7 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include "./dsp.h"
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
@@ -239,7 +239,7 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 //------------------------------------------------------------------------------
 // 4x4
 
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
 static void VE4(uint8_t* dst) {    // vertical
diff --git a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
index 3b6dde8..74ba34c 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -63,7 +63,7 @@ static const uint8_t abs0[255 + 255 + 1] = {
   0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
 };
 
-static const int8_t sclip1[1020 + 1020 + 1] = {
+static const uint8_t sclip1[1020 + 1020 + 1] = {
   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
@@ -236,7 +236,7 @@ static const int8_t sclip1[1020 + 1020 + 1] = {
   0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
 };
 
-static const int8_t sclip2[112 + 112 + 1] = {
+static const uint8_t sclip2[112 + 112 + 1] = {
   0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
   0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
   0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
@@ -339,8 +339,8 @@ static volatile int tables_ok = 0;
 
 #endif
 
-const int8_t* const VP8ksclip1 = &sclip1[1020];
-const int8_t* const VP8ksclip2 = &sclip2[112];
+const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
+const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
 const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];
 
diff --git a/src/3rdparty/libwebp/src/dsp/dec_msa.c b/src/3rdparty/libwebp/src/dsp/dec_msa.c
index f76055c..8d9c98c 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_msa.c
@@ -154,6 +154,820 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 }
 
 //------------------------------------------------------------------------------
+// Edge filtering functions
+
+#define FLIP_SIGN2(in0, in1, out0, out1) {  /* outN = inN ^ 0x80 per byte, as v16i8 */ \
+  out0 = (v16i8)__msa_xori_b(in0, 0x80);    /* top bit flipped: 0..255 -> -128..127 */ \
+  out1 = (v16i8)__msa_xori_b(in1, 0x80);    \
+}
+
+#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  FLIP_SIGN2(in0, in1, out0, out1);  /* four-vector form: FLIP_SIGN2 on two pairs */ \
+  FLIP_SIGN2(in2, in3, out2, out3);                               \
+}
+
+#define FILT_VAL(q0_m, p0_m, mask, filt) do {  /* filt is an in/out accumulator */ \
+  v16i8 q0_sub_p0;                             \
+  q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);      /* signed-saturating q0 - p0 */ \
+  filt = __msa_adds_s_b(filt, q0_sub_p0);      /* added three times; each add */ \
+  filt = __msa_adds_s_b(filt, q0_sub_p0);      /* saturates to the int8 range */ \
+  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
+  filt = filt & mask;                          /* keep only the bits set in mask */ \
+} while (0)
+
+#define FILT2(q_m, p_m, q, p) do {            /* reads temp1/temp3; writes u_r, u_l, u of the expanding scope */ \
+  u_r = SRAI_H(temp1, 7);                     /* NOTE(review): SRAI_H is defined elsewhere; presumably arithmetic >> 7 */ \
+  u_r = __msa_sat_s_h(u_r, 7);                /* clamp each halfword to the signed 8-bit range */ \
+  u_l = SRAI_H(temp3, 7);                     \
+  u_l = __msa_sat_s_h(u_l, 7);                \
+  u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);  /* pack both halves back into 16 bytes */ \
+  q_m = __msa_subs_s_b(q_m, u);               /* q_m -= u, p_m += u, both signed-saturating */ \
+  p_m = __msa_adds_s_b(p_m, u);               \
+  q = __msa_xori_b((v16u8)q_m, 0x80);         /* outputs q, p: results XORed with 0x80 */ \
+  p = __msa_xori_b((v16u8)p_m, 0x80);         \
+} while (0)
+
+#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do {  /* in/out: p1, p0, q0, q1; also inverts hev */ \
+  v16i8 p1_m, p0_m, q0_m, q1_m;                         \
+  v16i8 filt, t1, t2;                                   \
+  const v16i8 cnst4b = __msa_ldi_b(4);                  \
+  const v16i8 cnst3b = __msa_ldi_b(3);                  \
+  /* NOTE(review): SRAI_B is defined outside this chunk; presumably an arithmetic right shift */ \
+  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);   \
+  filt = __msa_subs_s_b(p1_m, q1_m);                    \
+  filt = filt & hev;                                    /* keep p1 - q1 only where hev is set */ \
+  FILT_VAL(q0_m, p0_m, mask, filt);                     /* adds q0 - p0 three times, then & mask */ \
+  t1 = __msa_adds_s_b(filt, cnst4b);                    \
+  t1 = SRAI_B(t1, 3);                                   /* t1 = sat(filt + 4) >> 3 */ \
+  t2 = __msa_adds_s_b(filt, cnst3b);                    \
+  t2 = SRAI_B(t2, 3);                                   /* t2 = sat(filt + 3) >> 3 */ \
+  q0_m = __msa_subs_s_b(q0_m, t1);                      \
+  q0 = __msa_xori_b((v16u8)q0_m, 0x80);                 /* q0 -= t1, 0x80 bias undone */ \
+  p0_m = __msa_adds_s_b(p0_m, t2);                      \
+  p0 = __msa_xori_b((v16u8)p0_m, 0x80);                 /* p0 += t2, 0x80 bias undone */ \
+  filt = __msa_srari_b(t1, 1);                          /* rounding shift: (t1 + 1) >> 1 */ \
+  hev = __msa_xori_b(hev, 0xff);                        /* hev = ~hev (visible to the caller) */ \
+  filt = filt & hev;                                    /* outer taps move only where hev was clear */ \
+  q1_m = __msa_subs_s_b(q1_m, filt);                    \
+  q1 = __msa_xori_b((v16u8)q1_m, 0x80);                 \
+  p1_m = __msa_adds_s_b(p1_m, filt);                    \
+  p1 = __msa_xori_b((v16u8)p1_m, 0x80);                 \
+} while (0)
+
+#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do {  \
+  v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                   \
+  v16i8 u, filt, t1, t2, filt_sign;                           \
+  v8i16 filt_r, filt_l, u_r, u_l;                             \
+  v8i16 temp0, temp1, temp2, temp3;                           \
+  const v16i8 cnst4b = __msa_ldi_b(4);                        \
+  const v16i8 cnst3b = __msa_ldi_b(3);                        \
+  const v8i16 cnst9h = __msa_ldi_h(9);                        \
+  /* In/out: p2..q2 are rewritten in place; hev is inverted as a side effect. */ \
+  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
+  filt = __msa_subs_s_b(p1_m, q1_m);                          \
+  FILT_VAL(q0_m, p0_m, mask, filt);                           \
+  FLIP_SIGN2(p2, q2, p2_m, q2_m);                             \
+  t2 = filt & hev;                                            \
+  /* filt_val &= ~hev: t2 keeps the hev lanes, filt the remaining ones */ \
+  hev = __msa_xori_b(hev, 0xff);                              \
+  filt = filt & hev;                                          \
+  t1 = __msa_adds_s_b(t2, cnst4b);                            \
+  t1 = SRAI_B(t1, 3);                                         \
+  t2 = __msa_adds_s_b(t2, cnst3b);                            \
+  t2 = SRAI_B(t2, 3);                                         \
+  q0_m = __msa_subs_s_b(q0_m, t1);                            \
+  p0_m = __msa_adds_s_b(p0_m, t2);                            \
+  filt_sign = __msa_clti_s_b(filt, 0);  /* all-ones lanes where filt < 0; presumably used to sign-extend below */ \
+  ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
+  /* update q2/p2: temp1/temp3 = 9 * filt + 63 (assuming ADDVI_H adds its immediate) */ \
+  temp0 = filt_r * cnst9h;                                    \
+  temp1 = ADDVI_H(temp0, 63);                                 \
+  temp2 = filt_l * cnst9h;                                    \
+  temp3 = ADDVI_H(temp2, 63);                                 \
+  FILT2(q2_m, p2_m, q2, p2);                                  \
+  /* update q1/p1: one more 9 * filt is added, i.e. 18 * filt + 63 */ \
+  temp1 = temp1 + temp0;                                      \
+  temp3 = temp3 + temp2;                                      \
+  FILT2(q1_m, p1_m, q1, p1);                                  \
+  /* update q0/p0: one more 9 * filt is added, i.e. 27 * filt + 63 */ \
+  temp1 = temp1 + temp0;                                      \
+  temp3 = temp3 + temp2;                                      \
+  FILT2(q0_m, p0_m, q0, p0);                                  \
+} while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
+                     q0_in, q1_in, q2_in, q3_in,                 \
+                     limit_in, b_limit_in, thresh_in,            \
+                     hev_out, mask_out) do {                     \
+  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+  v16u8 flat_out;                                                \
+  /* Writes only hev_out and mask_out; the pixel and threshold inputs are read-only. */ \
+  /* absolute subtraction of pixel values */                     \
+  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+  /* calculation of hev: thresh < max(|p1 - p0|, |q1 - q0|) */   \
+  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+  hev_out = (thresh_in < flat_out);                              \
+  /* calculation of mask: b_limit test first, then running max of neighbour diffs vs limit */ \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+  p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1);  /* NOTE(review): LPF_SIMPLE_MASK uses __msa_srli_b for this step; confirm SRAI_B on unsigned diffs >= 128 */ \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+  mask_out = (b_limit_in < p0_asub_q0_m);  /* comparison result, folded into the max below */ \
+  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+  mask_out = (limit_in < mask_out);                              \
+  mask_out = __msa_xori_b(mask_out, 0xff);  /* invert: set where the running max did not exceed limit */ \
+} while (0)
+
+#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) do { /* 4 + 2 bytes; callers here pass stride = 4 */ \
+  const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx);  /* halfword element in1_idx of in1 */ \
+  const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx);  /* word element in0_idx of in0 */ \
+  SW(tmp0_w, pdst);  /* SW/SH are defined elsewhere; presumably word / halfword stores */ \
+  SH(tmp0_h, pdst + stride);                                    \
+} while (0)
+
+#define ST6x4_UB(in0, start_in0_idx, in1, start_in1_idx, pdst, stride) do { \
+  uint8_t* ptmp1 = (uint8_t*)pdst;  /* four ST6x1_UB rows, `stride` bytes apart, consecutive element indices */ \
+  ST6x1_UB(in0, start_in0_idx, in1, start_in1_idx, ptmp1, 4);               \
+  ptmp1 += stride;                                                          \
+  ST6x1_UB(in0, start_in0_idx + 1, in1, start_in1_idx + 1, ptmp1, 4);       \
+  ptmp1 += stride;                                                          \
+  ST6x1_UB(in0, start_in0_idx + 2, in1, start_in1_idx + 2, ptmp1, 4);       \
+  ptmp1 += stride;                                                          \
+  ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4);       \
+} while (0)
+
+#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do {       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2;                \
+    const v16i8 cnst4b = __msa_ldi_b(4);                             \
+    const v16i8 cnst3b =  __msa_ldi_b(3);                            \
+    /* Only p0_in and q0_in are rewritten; p1_in and q1_in are read-only here. */ \
+    FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m);  \
+    filt = __msa_subs_s_b(p1_m, q1_m);                               \
+    FILT_VAL(q0_m, p0_m, mask, filt);  /* adds q0 - p0 three times, then & mask */ \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                            \
+    filt1 = SRAI_B(filt1, 3);  /* sat(filt + 4) >> 3; SRAI_B defined elsewhere, presumably arithmetic */ \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                            \
+    filt2 = SRAI_B(filt2, 3);  /* sat(filt + 3) >> 3 */ \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                              \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                              \
+    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                         \
+    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                         \
+} while (0)
+
+#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do {    \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                            \
+    /* mask = (sat(2 * |p0 - q0|) + (|p1 - q1| >> 1)) <= b_limit, per byte */ \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                      \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                      \
+    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);    \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);           \
+    mask = (mask <= b_limit);                                  \
+} while (0)
+
+static void VFilter16(uint8_t* src, int stride,  // filters across the row boundary just above src
+                      int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptemp = src - 4 * stride;  // first row read: 4 rows above src
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 mask, hev;
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  // LD_UB8 is defined elsewhere; presumably loads 8 rows, `stride` apart, into p3..q3.
+  LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);  // rewrites p2..q2
+  ptemp = src - 3 * stride;  // write-back starts at the p2 row; p3 and q3 are not stored
+  ST_UB4(p2, p1, p0, q0, ptemp, stride);
+  ptemp += (4 * stride);
+  ST_UB2(q1, q2, ptemp, stride);
+}
+
+static void HFilter16(uint8_t* src, int stride,  // filters across the column boundary just left of src
+                      int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptmp  = src - 4;  // rows are read starting 4 bytes left of src
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 mask, hev;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  v16u8 row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  // Read 16 rows, then (presumably, going by the macro name) transpose them into columns p3..q3.
+  LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  ptmp += (8 * stride);
+  LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);  // interleave macros defined elsewhere; regroup results for row-wise stores
+  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+  ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+  ptmp = src - 3;  // each ST6x1_UB below writes a 4-byte word at ptmp and a halfword at ptmp + 4
+  ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
+}
+
+// on three inner edges (VFilter16i calls this helper once per inner row edge)
+static void VFilterHorEdge16i(uint8_t* src, int stride,
+                              int b_limit, int limit, int thresh) {
+  v16u8 mask, hev;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+  const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+  const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+  // All 8 rows feed the mask; only p1, p0, q0, q1 are filtered and stored back.
+  LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);  // write-back starts 2 rows above src
+}
+
+static void VFilter16i(uint8_t* src_y, int stride,  // inner row edges at rows 4, 8 and 12
+                       int b_limit, int limit, int thresh) {
+  VFilterHorEdge16i(src_y +  4 * stride, stride, b_limit, limit, thresh);
+  VFilterHorEdge16i(src_y +  8 * stride, stride, b_limit, limit, thresh);
+  VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);
+}
+
+static void HFilterVertEdge16i(uint8_t* src, int stride,  // one inner column edge, 16 rows tall
+                               int b_limit, int limit, int thresh) {
+  v16u8 mask, hev;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+  const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+  const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+  // Read 16 rows starting 4 bytes left of src; presumably transposed into columns p3..q3.
+  LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(src - 4 + (8 * stride), stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);  // rewrites p1, p0, q0, q1 only
+  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);  // interleave macros defined elsewhere; regroup for row-wise stores
+  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+  src -= 2;  // write-back starts 2 bytes left of the edge
+  ST4x8_UB(tmp2, tmp3, src, stride);
+  src += (8 * stride);
+  ST4x8_UB(tmp4, tmp5, src, stride);
+}
+
+static void HFilter16i(uint8_t* src_y, int stride,  // inner column edges at x = 4, 8 and 12
+                       int b_limit, int limit, int thresh) {
+  HFilterVertEdge16i(src_y +  4, stride, b_limit, limit, thresh);
+  HFilterVertEdge16i(src_y +  8, stride, b_limit, limit, thresh);
+  HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);
+}
+
+// 8-pixels wide variants, for chroma filtering
+static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,  // both planes filtered in one pass
+                     int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptmp_src_u = src_u - 4 * stride;
+  uint8_t* ptmp_src_v = src_v - 4 * stride;
+  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  // Rows from u and v are combined into single vectors (ILVR_D4_UB is defined elsewhere).
+  LD_UB8(ptmp_src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+  LD_UB8(ptmp_src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  p2_d = __msa_copy_s_d((v2i64)p2, 0);  // doubleword 0 of each result goes back to the u plane
+  p1_d = __msa_copy_s_d((v2i64)p1, 0);
+  p0_d = __msa_copy_s_d((v2i64)p0, 0);
+  q0_d = __msa_copy_s_d((v2i64)q0, 0);
+  q1_d = __msa_copy_s_d((v2i64)q1, 0);
+  q2_d = __msa_copy_s_d((v2i64)q2, 0);
+  ptmp_src_u += stride;  // now src_u - 3 * stride, the p2 row
+  SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_u, stride);
+  ptmp_src_u += (4 * stride);
+  SD(q1_d, ptmp_src_u);
+  ptmp_src_u += stride;
+  SD(q2_d, ptmp_src_u);
+  p2_d = __msa_copy_s_d((v2i64)p2, 1);  // doubleword 1 of each result goes back to the v plane
+  p1_d = __msa_copy_s_d((v2i64)p1, 1);
+  p0_d = __msa_copy_s_d((v2i64)p0, 1);
+  q0_d = __msa_copy_s_d((v2i64)q0, 1);
+  q1_d = __msa_copy_s_d((v2i64)q1, 1);
+  q2_d = __msa_copy_s_d((v2i64)q2, 1);
+  ptmp_src_v += stride;  // now src_v - 3 * stride
+  SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_v, stride);
+  ptmp_src_v += (4 * stride);
+  SD(q1_d, ptmp_src_v);
+  ptmp_src_v += stride;
+  SD(q2_d, ptmp_src_v);
+}
+
+static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,  // column edge just left of src_u / src_v
+                     int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptmp_src_u = src_u - 4;
+  uint8_t* ptmp_src_v = src_v - 4;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  v16u8 row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  // row0..row7 come from the u plane, row8..row15 from the v plane.
+  LD_UB8(ptmp_src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(ptmp_src_v, stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);  // interleave macros defined elsewhere; regroup for row-wise stores
+  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+  ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+  ptmp_src_u += 1;  // now src_u - 3: six bytes per row are written from here
+  ST6x4_UB(tmp3, 0, tmp2, 0, ptmp_src_u, stride);
+  ptmp_src_u += 4 * stride;
+  ST6x4_UB(tmp4, 0, tmp2, 4, ptmp_src_u, stride);
+  ptmp_src_v += 1;  // now src_v - 3
+  ST6x4_UB(tmp6, 0, tmp5, 0, ptmp_src_v, stride);
+  ptmp_src_v += 4 * stride;
+  ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
+}
+
+static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,  // reads rows 0..7 starting at src_u / src_v
+                      int b_limit_in, int limit_in, int thresh_in) {
+  uint64_t p1_d, p0_d, q0_d, q1_d;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  // Unlike VFilter8, loading starts at the pointers themselves, with no 4-row back-off.
+  LD_UB8(src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+  src_u += (5 * stride);  // row 5; the SD4 calls below use a negative stride from here
+  LD_UB8(src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+  src_v += (5 * stride);
+  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  p1_d = __msa_copy_s_d((v2i64)p1, 0);  // doubleword 0 of each result goes to the u plane
+  p0_d = __msa_copy_s_d((v2i64)p0, 0);
+  q0_d = __msa_copy_s_d((v2i64)q0, 0);
+  q1_d = __msa_copy_s_d((v2i64)q1, 0);
+  SD4(q1_d, q0_d, p0_d, p1_d, src_u, -stride);  // presumably q1 at row 5 down to p1 at row 2 (SD4 defined elsewhere)
+  p1_d = __msa_copy_s_d((v2i64)p1, 1);  // doubleword 1 of each result goes to the v plane
+  p0_d = __msa_copy_s_d((v2i64)p0, 1);
+  q0_d = __msa_copy_s_d((v2i64)q0, 1);
+  q1_d = __msa_copy_s_d((v2i64)q1, 1);
+  SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
+}
+
+static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,  // reads 8 rows starting at src_u / src_v
+                      int b_limit_in, int limit_in, int thresh_in) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  v16u8 row9, row10, row11, row12, row13, row14, row15;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  // Unlike HFilter8, loading starts at the pointers themselves, with no 4-byte back-off.
+  LD_UB8(src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(src_v, stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);  // interleave macros defined elsewhere; regroup for row-wise stores
+  ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+  ILVL_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+  src_u += 2;  // write-back starts at byte offset 2 of each row
+  ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, src_u, stride);
+  src_u += 4 * stride;
+  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src_u, stride);
+  src_v += 2;
+  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src_v, stride);
+  src_v += 4 * stride;
+  ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
+}
+
+static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {  // row edge just above src
+  v16u8 p1, p0, q1, q0, mask;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  // Four rows are read (2 above src, 2 from src on); only p0 and q0 are written back.
+  LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);
+  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+  ST_UB2(p0, q0, src - stride, stride);
+}
+
+static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {  // column edge just left of src
+  v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  uint8_t* ptemp_src = src - 2;  // rows are read starting 2 bytes left of src
+  // Read 16 rows; presumably transposed (per the macro name) into columns p1, p0, q0, q1.
+  LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(ptemp_src + 8 * stride, stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p1, p0, q0, q1);
+  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);  // rewrites p0 and q0 only
+  ILVRL_B2_SH(q0, p0, tmp1, tmp0);  // presumably pairs each p0 byte with its q0 byte (macro defined elsewhere)
+  ptemp_src += 1;  // now src - 1: the p0/q0 byte pair of each row
+  ST2x4_UB(tmp1, 0, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+  ST2x4_UB(tmp1, 4, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+  ST2x4_UB(tmp0, 0, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+  ST2x4_UB(tmp0, 4, ptemp_src, stride);
+  ptemp_src += 4 * stride;  // NOTE(review): ptemp_src is not read after this; the advance is unused
+}
+
+static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {  // row edges at rows 4, 8, 12
+  SimpleVFilter16(src_y +  4 * stride, stride, b_limit_in);
+  SimpleVFilter16(src_y +  8 * stride, stride, b_limit_in);
+  SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in);
+}
+
+static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {  // column edges at x = 4, 8, 12
+  SimpleHFilter16(src_y +  4, stride, b_limit_in);
+  SimpleHFilter16(src_y +  8, stride, b_limit_in);
+  SimpleHFilter16(src_y + 12, stride, b_limit_in);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+//------------------------------------------------------------------------------
+
+// 4x4
+
+static void DC4(uint8_t* dst) {   // DC: fill with the rounded mean of 4 top + 4 left samples
+  uint32_t dc = 4;  // rounding term for the >> 3 below
+  int i;
+  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  dc >>= 3;
+  dc = dc | (dc << 8) | (dc << 16) | (dc << 24);  // replicate the byte into all 4 bytes of the word
+  SW4(dc, dc, dc, dc, dst, BPS);  // SW4 defined elsewhere; presumably one word per row, BPS apart
+}
+
+static void TM4(uint8_t* dst) {  // each row: top - top_left + left sample of that row, then clipped
+  const uint8_t* const ptemp = dst - BPS - 1;  // top-left neighbour
+  v8i16 T, d, r0, r1, r2, r3;
+  const v16i8 zero = { 0 };
+  const v8i16 TL = (v8i16)__msa_fill_h(ptemp[0 * BPS]);
+  const v8i16 L0 = (v8i16)__msa_fill_h(ptemp[1 * BPS]);  // L0..L3: left samples of rows 0..3
+  const v8i16 L1 = (v8i16)__msa_fill_h(ptemp[2 * BPS]);
+  const v8i16 L2 = (v8i16)__msa_fill_h(ptemp[3 * BPS]);
+  const v8i16 L3 = (v8i16)__msa_fill_h(ptemp[4 * BPS]);
+  const v16u8 T1 = LD_UB(ptemp + 1);  // top row, starting directly above dst
+  // Widen the top bytes to 16 bits by interleaving with zero.
+  T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+  d = T - TL;
+  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+  CLIP_SH4_0_255(r0, r1, r2, r3);  // defined elsewhere; by its name, clamps to 0..255
+  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+static void VE4(uint8_t* dst) {    // vertical: smoothed top row copied into all 4 rows
+  const uint8_t* const ptop = dst - BPS - 1;  // starts at the top-left neighbour
+  const uint32_t val0 = LW(ptop + 0);
+  const uint32_t val1 = LW(ptop + 4);
+  uint32_t out;
+  v16u8 A, B, C, AC, B2, R;
+  // A, B, C: the 8 loaded bytes at shifts 0/1/2 (SLDI_UB defined elsewhere; presumably a byte slide).
+  INSERT_W2_UB(val0, val1, A);
+  B = SLDI_UB(A, A, 1);
+  C = SLDI_UB(A, A, 2);
+  AC = __msa_ave_u_b(A, C);  // (A + C) >> 1, truncating
+  B2 = __msa_ave_u_b(B, B);  // equals B
+  R = __msa_aver_u_b(AC, B2);  // (AC + B + 1) >> 1, which equals (A + 2 * B + C + 2) >> 2
+  out = __msa_copy_s_w((v4i32)R, 0);
+  SW4(out, out, out, out, dst, BPS);
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  const uint8_t* const ptop = dst - 1 - BPS;  // top-left neighbour
+  uint32_t val0 = LW(ptop + 0);
+  uint32_t val1 = LW(ptop + 4);
+  uint32_t val2, val3;
+  v16u8 A, B, C, AC, B2, R, A1;
+  // Build the edge vector: bytes 0..3 = left column from bottom to top, then the loaded top bytes.
+  INSERT_W2_UB(val0, val1, A1);
+  A = SLDI_UB(A1, A1, 12);  // SLDI_UB defined elsewhere; presumably moves the top bytes up to make room
+  A = (v16u8)__msa_insert_b((v16i8)A, 3, ptop[1 * BPS]);
+  A = (v16u8)__msa_insert_b((v16i8)A, 2, ptop[2 * BPS]);
+  A = (v16u8)__msa_insert_b((v16i8)A, 1, ptop[3 * BPS]);
+  A = (v16u8)__msa_insert_b((v16i8)A, 0, ptop[4 * BPS]);
+  B = SLDI_UB(A, A, 1);
+  C = SLDI_UB(A, A, 2);
+  AC = __msa_ave_u_b(A, C);  // same 3-tap smoothing as VE4: (A + 2 * B + C + 2) >> 2
+  B2 = __msa_ave_u_b(B, B);
+  R = __msa_aver_u_b(AC, B2);
+  val3 = __msa_copy_s_w((v4i32)R, 0);  // last row first; each slide feeds the row above
+  R = SLDI_UB(R, R, 1);
+  val2 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val1 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val0 = __msa_copy_s_w((v4i32)R, 0);
+  SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static void LD4(uint8_t* dst) {   // Down-Left
+  const uint8_t* const ptop = dst - BPS;  // row directly above dst; 8 bytes are read
+  uint32_t val0 = LW(ptop + 0);
+  uint32_t val1 = LW(ptop + 4);
+  uint32_t val2, val3;
+  v16u8 A, B, C, AC, B2, R;
+  // A, B, C: the top bytes at shifts 0/1/2 (SLDI_UB defined elsewhere; presumably a byte slide).
+  INSERT_W2_UB(val0, val1, A);
+  B = SLDI_UB(A, A, 1);
+  C = SLDI_UB(A, A, 2);
+  C = (v16u8)__msa_insert_b((v16i8)C, 6, ptop[7]);  // byte 6 of C is forced to the last top sample
+  AC = __msa_ave_u_b(A, C);  // same 3-tap smoothing as VE4: (A + 2 * B + C + 2) >> 2
+  B2 = __msa_ave_u_b(B, B);
+  R = __msa_aver_u_b(AC, B2);
+  val0 = __msa_copy_s_w((v4i32)R, 0);  // row 0 first; each slide feeds the next row
+  R = SLDI_UB(R, R, 1);
+  val1 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val2 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val3 = __msa_copy_s_w((v4i32)R, 0);
+  SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+// 16x16
+
+static void DC16(uint8_t* dst) {   // DC: fill with the rounded mean of 16 top + 16 left samples
+  uint32_t dc = 16;  // rounding term for the >> 5 below
+  int i;
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);  // pairwise sums of the top row
+  v16u8 out;
+  // Left column is summed in scalar code; the top row via HADD_UH_U32 (defined elsewhere).
+  for (i = 0; i < 16; ++i) {
+    dc += dst[-1 + i * BPS];
+  }
+  dc += HADD_UH_U32(dctop);
+  out = (v16u8)__msa_fill_b(dc >> 5);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void TM16(uint8_t* dst) {  // each row: top - top_left + left sample of that row, then clipped
+  int j;
+  v8i16 d1, d2;
+  const v16i8 zero = { 0 };
+  const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+  const v16i8 T = LD_SB(dst - BPS);
+  // d1/d2: the two halves of (top - top_left) as 16-bit lanes (ILVRL_B2_SH defined elsewhere).
+  ILVRL_B2_SH(zero, T, d1, d2);
+  SUB2(d1, TL, d2, TL, d1, d2);
+  for (j = 0; j < 16; j += 4) {  // four rows per iteration
+    v16i8 t0, t1, t2, t3;
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+    const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+    const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+    const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+    const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+    ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+    ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+    CLIP_SH4_0_255(r0, r1, r2, r3);  // defined elsewhere; by its name, clamps to 0..255
+    CLIP_SH4_0_255(r4, r5, r6, r7);
+    PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+    ST_SB4(t0, t1, t2, t3, dst, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void VE16(uint8_t* dst) {   // vertical: the row above dst is copied into all 16 rows
+  const v16u8 rtop = LD_UB(dst - BPS);
+  ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);
+  ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);
+}
+
+static void HE16(uint8_t* dst) {   // horizontal: each row is filled with its own left sample
+  int j;
+  for (j = 16; j > 0; j -= 4) {  // four rows per iteration
+    const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+    const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+    const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+    const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+    ST_UB4(L0, L1, L2, L3, dst, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+  int j;
+  uint32_t dc = 8;  // rounding term for the >> 4 below
+  v16u8 out;
+  // Only the 16 left samples contribute to the mean.
+  for (j = 0; j < 16; ++j) {
+    dc += dst[-1 + j * BPS];
+  }
+  out = (v16u8)__msa_fill_b(dc >> 4);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoLeft(uint8_t* dst) {   // DC with left samples not available
+  uint32_t dc = 8;  // rounding term for the >> 4 below
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);  // pairwise sums of the top row
+  v16u8 out;
+  // Only the top row contributes; HADD_UH_U32 (defined elsewhere) presumably sums the 8 halfwords.
+  dc += HADD_UH_U32(dctop);
+  out = (v16u8)__msa_fill_b(dc >> 4);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) {   // DC with nothing: constant 0x80 fill
+  const v16u8 out = (v16u8)__msa_fill_b(0x80);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+// Chroma
+
+#define STORE8x8(out, dst) do {                 /* same 64-bit value to 8 rows */ \
+  SD4(out, out, out, out, dst + 0 * BPS, BPS);  /* SD4 defined elsewhere; presumably 4 rows each */ \
+  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
+} while (0)
+
+static void DC8uv(uint8_t* dst) {   // DC: fill with the rounded mean of 8 top + 8 left samples
+  uint32_t dc = 8;  // rounding term for the >> 4 below
+  int i;
+  uint64_t out;
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);  // widening pairwise adds: byte -> half -> word -> dword
+  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+  const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);  // doubleword 0 = sum of top bytes 0..7
+  v16u8 dctemp;
+  // Left column is summed in scalar code.
+  for (i = 0; i < 8; ++i) {
+    dc += dst[-1 + i * BPS];
+  }
+  dc += __msa_copy_s_w((v4i32)temp2, 0);  // low word of doubleword 0
+  dctemp = (v16u8)__msa_fill_b(dc >> 4);
+  out = __msa_copy_s_d((v2i64)dctemp, 0);
+  STORE8x8(out, dst);
+}
+
+static void TM8uv(uint8_t* dst) {  // each row: top - top_left + left sample of that row, then clipped
+  int j;
+  const v16i8 T1 = LD_SB(dst - BPS);
+  const v16i8 zero = { 0 };
+  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, T1);  // top bytes widened to 16 bits
+  const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+  const v8i16 d = T - TL;
+  // Four rows per iteration; r0..r3 start as the left samples and become the row results.
+  for (j = 0; j < 8; j += 4) {
+    v16i8 t0, t1;
+    v8i16 r0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+    v8i16 r1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+    v8i16 r2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+    v8i16 r3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+    ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+    CLIP_SH4_0_255(r0, r1, r2, r3);  // defined elsewhere; by its name, clamps to 0..255
+    PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+    ST4x4_UB(t0, t1, 0, 2, 0, 2, dst, BPS);  // ST4x4_UB defined elsewhere; presumably the left 4 bytes of 4 rows
+    ST4x4_UB(t0, t1, 1, 3, 1, 3, dst + 4, BPS);  // and the right 4 bytes of the same rows
+    dst += 4 * BPS;
+  }
+}
+
+static void VE8uv(uint8_t* dst) {   // vertical: low 8 bytes of the row above go to all 8 rows
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const uint64_t out = __msa_copy_s_d((v2i64)rtop, 0);
+  STORE8x8(out, dst);
+}
+
+static void HE8uv(uint8_t* dst) {   // horizontal: each row is filled with its own left sample
+  int j;
+  for (j = 0; j < 8; j += 4) {  // four rows per iteration
+    const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+    const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+    const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+    const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+    const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);  // 8 replicated bytes per row
+    const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+    const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+    const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+    SD4(out0, out1, out2, out3, dst, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  const uint32_t dc = 4;  // rounding term for the >> 3 below
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);  // widening pairwise adds: byte -> half -> word -> dword
+  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+  const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);  // doubleword 0 = sum of top bytes 0..7
+  const uint32_t sum_m = __msa_copy_s_w((v4i32)temp2, 0);
+  const v16u8 dcval = (v16u8)__msa_fill_b((dc + sum_m) >> 3);
+  const uint64_t out = __msa_copy_s_d((v2i64)dcval, 0);
+  STORE8x8(out, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) {   // DC with no top samples
+  uint32_t dc = 4;  // rounding term for the >> 3 below
+  int i;
+  uint64_t out;
+  v16u8 dctemp;
+  // Only the 8 left samples contribute to the mean.
+  for (i = 0; i < 8; ++i) {
+    dc += dst[-1 + i * BPS];
+  }
+  dctemp = (v16u8)__msa_fill_b(dc >> 3);
+  out = __msa_copy_s_d((v2i64)dctemp, 0);
+  STORE8x8(out, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) {   // DC with nothing: constant 0x80 fill
+  const uint64_t out = 0x8080808080808080ULL;  // eight 0x80 bytes
+  STORE8x8(out, dst);
+}
+
+//------------------------------------------------------------------------------
 // Entry point
 
 extern void VP8DspInitMSA(void);
@@ -163,6 +977,39 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
   VP8Transform = TransformTwo;
   VP8TransformDC = TransformDC;
   VP8TransformAC3 = TransformAC3;
+
+  VP8VFilter16  = VFilter16;
+  VP8HFilter16  = HFilter16;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8  = VFilter8;
+  VP8HFilter8  = HFilter8;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16  = SimpleVFilter16;
+  VP8SimpleHFilter16  = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index a63f43f..34796cf 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -17,7 +17,7 @@
 #if defined(WEBP_USE_NEON)
 
 #include "./neon.h"
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
 
 //------------------------------------------------------------------------------
 // NxM Loading functions
@@ -666,9 +666,8 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
-  const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
-  const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
-  const uint8x16_t mask = vorrq_u8(mask1, mask2);
+  const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
+  const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
   return mask;
 }
 
@@ -756,24 +755,25 @@ static void ApplyFilter6(
     const int8x16_t delta,
     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
-  const int16x8_t kCst63 = vdupq_n_s16(63);
-  const int8x8_t kCst27 = vdup_n_s8(27);
-  const int8x8_t kCst18 = vdup_n_s8(18);
-  const int8x8_t kCst9 = vdup_n_s8(9);
+  // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
+  // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
+  // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
+  //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
   const int8x8_t delta_lo = vget_low_s8(delta);
   const int8x8_t delta_hi = vget_high_s8(delta);
-  const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo);  // 63 + 27 * a
-  const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi);  // 63 + 27 * a
-  const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo);  // 63 + 18 * a
-  const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi);  // 63 + 18 * a
-  const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo);   // 63 + 9 * a
-  const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi);   // 63 + 9 * a
-  const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
-  const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
-  const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
-  const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
-  const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
-  const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
+  const int8x8_t kCst9 = vdup_n_s8(9);
+  const int16x8_t kCstm1 = vdupq_n_s16(-1);
+  const int8x8_t kCst18 = vdup_n_s8(18);
+  const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
+  const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
+  const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
+  const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
+  const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
+  const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
+  const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
+  const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
+  const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
+  const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
   const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
index f0a8ddc..411fb02 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -22,7 +22,7 @@
 
 #include <emmintrin.h>
 #include "./common_sse2.h"
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
@@ -140,7 +140,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
 
     // Transpose the two 4x4.
     VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
-                        &T2, &T3);
+                           &T2, &T3);
   }
 
   // Add inverse transform to 'dst' and store.
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
index 8d6aed1..4e81ec4 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
@@ -16,7 +16,7 @@
 #if defined(WEBP_USE_SSE41)
 
 #include <smmintrin.h>
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
 #include "../utils/utils.h"
 
 static void HE16(uint8_t* dst) {     // horizontal
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 1faac27..813fed4 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -111,8 +111,7 @@ extern "C" {
 
 #define WEBP_UBSAN_IGNORE_UNDEF
 #define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#if !defined(WEBP_FORCE_ALIGNED) && defined(__clang__) && \
-    defined(__has_attribute)
+#if defined(__clang__) && defined(__has_attribute)
 #if __has_attribute(no_sanitize)
 // This macro prevents the undefined behavior sanitizer from reporting
 // failures. This is only meant to silence unaligned loads on platforms that
@@ -133,6 +132,7 @@ extern "C" {
 typedef enum {
   kSSE2,
   kSSE3,
+  kSlowSSSE3,  // special feature for slow SSSE3 architectures
   kSSE4_1,
   kAVX,
   kAVX2,
@@ -185,6 +185,11 @@ typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
 // 4 by 4 symmetric matrix.
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
 
+// Compute the sum of samples (un-normalized DC) of four adjacent 4x4 blocks.
+// Each sub-4x4 block #i sum is stored in dc[i].
+typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
+extern VP8MeanMetric VP8Mean16x4;
+
 typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
 extern VP8BlockCopy VP8Copy4x4;
 extern VP8BlockCopy VP8Copy16x8;
@@ -246,30 +251,37 @@ extern VP8GetResidualCostFunc VP8GetResidualCost;
 void VP8EncDspCostInit(void);
 
 //------------------------------------------------------------------------------
-// SSIM utils
+// SSIM / PSNR utils
 
 // struct for accumulating statistical moments
 typedef struct {
-  double w;              // sum(w_i) : sum of weights
-  double xm, ym;         // sum(w_i * x_i), sum(w_i * y_i)
-  double xxm, xym, yym;  // sum(w_i * x_i * x_i), etc.
+  uint32_t w;              // sum(w_i) : sum of weights
+  uint32_t xm, ym;         // sum(w_i * x_i), sum(w_i * y_i)
+  uint32_t xxm, xym, yym;  // sum(w_i * x_i * x_i), etc.
 } VP8DistoStats;
 
+// Compute the final SSIM value
+// The non-clipped version assumes stats->w = 16 * 16 (squared sum of weights).
+double VP8SSIMFromStats(const VP8DistoStats* const stats);
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats);
+
 #define VP8_SSIM_KERNEL 3   // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1
-typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1,
-                                             const uint8_t* src2, int stride2,
-                                             int xo, int yo,  // center position
-                                             int W, int H,    // plane dimension
-                                             VP8DistoStats* const stats);
+typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
+                                        const uint8_t* src2, int stride2,
+                                        int xo, int yo,  // center position
+                                        int W, int H);   // plane dimension
 
 // This version is called with the guarantee that you can load 8 bytes and
 // 8 rows at offset src1 and src2
-typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1,
-                                      const uint8_t* src2, int stride2,
-                                      VP8DistoStats* const stats);
+typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
+                                 const uint8_t* src2, int stride2);
+
+extern VP8SSIMGetFunc VP8SSIMGet;         // unclipped / unchecked
+extern VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // with clipping
 
-extern VP8SSIMAccumulateFunc VP8SSIMAccumulate;         // unclipped / unchecked
-extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;   // with clipping
+typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
+                                         const uint8_t* src2, int len);
+extern VP8AccumulateSSEFunc VP8AccumulateSSE;
 
 // must be called before using any of the above directly
 void VP8SSIMDspInit(void);
@@ -416,6 +428,15 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
                                     uint8_t* u, uint8_t* v, int width);
 
+// utilities for accurate RGB->YUV conversion
+extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
+                                       uint16_t* dst, int len);
+extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
+                                     int16_t* dst, int len);
+extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
+                                     int len,
+                                     const uint16_t* best_y, uint16_t* out);
+
 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);
 
@@ -488,6 +509,10 @@ extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
                                int width, int height,
                                uint8_t* alpha, int alpha_stride);
 
+// Extract the green values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlphaToGreen).
+extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
 
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index f639f55..f31bc6d 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -15,7 +15,7 @@
 #include <stdlib.h>  // for abs()
 
 #include "./dsp.h"
-#include "../enc/vp8enci.h"
+#include "../enc/vp8i_enc.h"
 
 static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
@@ -335,7 +335,7 @@ static void Intra16Preds(uint8_t* dst,
 // luma 4x4 prediction
 
 #define DST(x, y) dst[(x) + (y) * BPS]
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
 static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
@@ -551,6 +551,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 4, 4);
 }
 
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+  int k, x, y;
+  for (k = 0; k < 4; ++k) {   // four 4x4 blocks laid out side by side
+    uint32_t avg = 0;   // running sum; despite the name it is never divided
+    for (y = 0; y < 4; ++y) {
+      for (x = 0; x < 4; ++x) {
+        avg += ref[x + y * BPS];   // consecutive rows are BPS bytes apart
+      }
+    }
+    dc[k] = avg;   // total of the 16 samples of block #k
+    ref += 4;   // go to next 4x4 block.
+  }
+}
+
 //------------------------------------------------------------------------------
 // Texture distortion
 //
@@ -656,32 +670,6 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
   return nz;
 }
 
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  int n, last = -1;
-  for (n = 0; n < 16; ++n) {
-    const int j = kZigzag[n];
-    const int sign = (in[j] < 0);
-    const uint32_t coeff = sign ? -in[j] : in[j];
-    assert(mtx->sharpen_[j] == 0);
-    if (coeff > mtx->zthresh_[j]) {
-      const uint32_t Q = mtx->q_[j];
-      const uint32_t iQ = mtx->iq_[j];
-      const uint32_t B = mtx->bias_[j];
-      int level = QUANTDIV(coeff, iQ, B);
-      if (level > MAX_LEVEL) level = MAX_LEVEL;
-      if (sign) level = -level;
-      in[j] = level * (int)Q;
-      out[n] = level;
-      if (level) last = n;
-    } else {
-      out[n] = 0;
-      in[j] = 0;
-    }
-  }
-  return (last >= 0);
-}
-
 //------------------------------------------------------------------------------
 // Block copy
 
@@ -703,11 +691,51 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
 }
 
 //------------------------------------------------------------------------------
+// SSIM / PSNR
 
-static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
-                                  const uint8_t* src2, int stride2,
-                                  int xo, int yo, int W, int H,
-                                  VP8DistoStats* const stats) {
+// Hat-shaped (triangular) 1-D filter. Its coefficients add up to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // (sum of kWeight[])^2
+
+static WEBP_INLINE double SSIMCalculation(
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const uint32_t w2 =  N * N;   // the constants below are all scaled by N^2
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;   // 64-bit products
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  if (xmxm + ymym >= C3) {
+    const int64_t xmym = (int64_t)stats->xm * stats->ym;
+    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;   // N * xxm - xm^2
+    const uint64_t syy = (uint64_t)stats->yym * N - ymym;   // N * yym - ym^2
+    // we descale by 8 bits to prevent overflow during the fnum/fden multiply.
+    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+    const uint64_t den_S = (sxx + syy + C2) >> 8;
+    const uint64_t fnum = (2 * xmym + C1) * num_S;
+    const uint64_t fden = (xmxm + ymym + C1) * den_S;
+    const double r = (double)fnum / fden;
+    assert(r >= 0. && r <= 1.0);   // the ratio is expected to stay in [0, 1]
+    return r;
+  }
+  return 1.;   // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, kWeightSum);   // fixed weight; stats->w unused
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);   // uses the accumulated weight
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
   const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
   const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
                                                   : yo + VP8_SSIM_KERNEL;
@@ -719,38 +747,61 @@ static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
   src2 += ymin * stride2;
   for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
     for (x = xmin; x <= xmax; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats->w   += 1;
-      stats->xm  += s1;
-      stats->ym  += s2;
-      stats->xxm += s1 * s1;
-      stats->xym += s1 * s2;
-      stats->yym += s2 * s2;
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
+  return VP8SSIMFromStatsClipped(&stats);
 }
 
-static void SSIMAccumulate(const uint8_t* src1, int stride1,
-                           const uint8_t* src2, int stride2,
-                           VP8DistoStats* const stats) {
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
   int x, y;
   for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
     for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const int s1 = src1[x];
-      const int s2 = src2[x];
-      stats->w   += 1;
-      stats->xm  += s1;
-      stats->ym  += s2;
-      stats->xxm += s1 * s1;
-      stats->xym += s1 * s2;
-      stats->yym += s2 * s2;
+      const uint32_t w = kWeight[x] * kWeight[y];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
     }
   }
+  return VP8SSIMFromStats(&stats);
+}
+
+//------------------------------------------------------------------------------
+
+static uint32_t AccumulateSSE(const uint8_t* src1,
+                              const uint8_t* src2, int len) {
+  int i;
+  uint32_t sse2 = 0;   // sum of squared differences over 'len' samples
+  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
+  for (i = 0; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];   // within [-255, 255]
+    sse2 += diff * diff;   // at most 255 * 255 per sample
+  }
+  return sse2;
+}
 
-VP8SSIMAccumulateFunc VP8SSIMAccumulate;
-VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;
+//------------------------------------------------------------------------------
+
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+
+extern void VP8SSIMDspInitSSE2(void);
 
 static volatile VP8CPUInfo ssim_last_cpuinfo_used =
     (VP8CPUInfo)&ssim_last_cpuinfo_used;
@@ -758,8 +809,17 @@ static volatile VP8CPUInfo ssim_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
   if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  VP8SSIMAccumulate = SSIMAccumulate;
-  VP8SSIMAccumulateClipped = SSIMAccumulateClipped;
+  VP8SSIMGetClipped = SSIMGetClipped_C;
+  VP8SSIMGet = SSIMGet_C;
+
+  VP8AccumulateSSE = AccumulateSSE;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SSIMDspInitSSE2();
+    }
+#endif
+  }
 
   ssim_last_cpuinfo_used = VP8GetCPUInfo;
 }
@@ -783,6 +843,7 @@ VP8Metric VP8SSE16x8;
 VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
+VP8MeanMetric VP8Mean16x4;
 VP8QuantizeBlock VP8EncQuantizeBlock;
 VP8Quantize2Blocks VP8EncQuantize2Blocks;
 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
@@ -795,6 +856,7 @@ extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);
 extern void VP8EncDspInitMIPSdspR2(void);
+extern void VP8EncDspInitMSA(void);
 
 static volatile VP8CPUInfo enc_last_cpuinfo_used =
     (VP8CPUInfo)&enc_last_cpuinfo_used;
@@ -820,9 +882,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
   VP8SSE4x4 = SSE4x4;
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
+  VP8Mean16x4 = Mean16x4;
   VP8EncQuantizeBlock = QuantizeBlock;
   VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+  VP8EncQuantizeBlockWHT = QuantizeBlock;
   VP8Copy4x4 = Copy4x4;
   VP8Copy16x8 = Copy16x8;
 
@@ -858,6 +921,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
       VP8EncDspInitMIPSdspR2();
     }
 #endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8EncDspInitMSA();
+    }
+#endif
   }
   enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
index fd10143..752b14d 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -18,8 +18,8 @@
 #if defined(WEBP_USE_MIPS32)
 
 #include "./mips_macro.h"
-#include "../enc/vp8enci.h"
-#include "../enc/cost.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
index 7ab96f6..6c8c1c6 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -17,8 +17,8 @@
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
 #include "./mips_macro.h"
-#include "../enc/cost.h"
-#include "../enc/vp8enci.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
diff --git a/src/3rdparty/libwebp/src/dsp/enc_msa.c b/src/3rdparty/libwebp/src/dsp/enc_msa.c
new file mode 100644
index 0000000..909b46d
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/enc_msa.c
@@ -0,0 +1,892 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of encoder dsp functions.
+//
+// Author:  Prashant Patil   (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <stdlib.h>
+#include "./msa_macro.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  v4i32 a1_m, b1_m, c1_m, d1_m;                                     \
+  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);              \
+  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                    \
+  v4i32 c_tmp1_m = in1 * sinpi8sqrt2;                               \
+  v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1;                         \
+  v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1;                         \
+  v4i32 d_tmp2_m = in3 * sinpi8sqrt2;                               \
+                                                                    \
+  ADDSUB2(in0, in2, a1_m, b1_m);                                    \
+  SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16);                               \
+  c_tmp2_m = c_tmp2_m + in3;                                        \
+  c1_m = c_tmp1_m - c_tmp2_m;                                       \
+  SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16);                               \
+  d_tmp1_m = d_tmp1_m + in1;                                        \
+  d1_m = d_tmp1_m + d_tmp2_m;                                       \
+  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
+} while (0)  // one 1-D pass on v4i32 lanes: in0..in3 -> out0..out3
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {   // dst = clip(ref + T(in))
+  v8i16 input0, input1;
+  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+  v4i32 res0, res1, res2, res3;
+  v16i8 dest0, dest1, dest2, dest3;
+  const v16i8 zero = { 0 };
+
+  LD_SH2(in, 8, input0, input1);   // coefficients as two 8-lane vectors
+  UNPCK_SH_SW(input0, in0, in1);   // presumably widens int16 lanes to int32
+  UNPCK_SH_SW(input1, in2, in3);
+  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);   // first 1-D pass
+  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);   // second 1-D pass
+  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);   // presumably a rounding '>> 3'
+  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);   // four rows of 'ref'
+  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+             res0, res1, res2, res3);   // interleave with zero: bytes -> 16 bits
+  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+             res0, res1, res2, res3);   // interleave again: 16 -> 32 bits
+  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+  CLIP_SW4_0_255(res0, res1, res2, res3);   // clamp to [0, 255] (per macro name)
+  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);   // pack back down to bytes
+  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);   // words 3,2,1,0 go to dst rows
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {   // do_two != 0: also handle a 2nd block
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);   // 4 pixels right, next 16 coeffs
+  }
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  uint64_t out0, out1, out2, out3;
+  uint32_t in0, in1, in2, in3;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  v8i16 t0, t1, t2, t3;
+  v16u8 srcl0, srcl1, src0, src1;
+  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };   // shuffle patterns for
+  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };   // the VSHF_H2_SH calls
+  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
+  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
+  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
+
+  LW4(src, BPS, in0, in1, in2, in3);   // four 32-bit rows of 'src'
+  INSERT_W4_UB(in0, in1, in2, in3, src0);
+  LW4(ref, BPS, in0, in1, in2, in3);   // four 32-bit rows of 'ref'
+  INSERT_W4_UB(in0, in1, in2, in3, src1);
+  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
+  HSUB_UB2_SH(srcl0, srcl1, t0, t1);   // presumably the src - ref differences
+  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+  ADDSUB2(t2, t3, t0, t1);
+  t0 = SRLI_H(t0, 3);
+  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+  tmp0 = __msa_hadd_s_w(t3, t3);
+  tmp2 = __msa_hsub_s_w(t3, t3);
+  FILL_W2_SW(1812, 937, tmp1, tmp3);   // bias terms of the first pass
+  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+  SRAI_W2_SW(tmp1, tmp3, 9);
+  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+  ADDSUB2(t2, t3, t0, t1);
+  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+  tmp0 = __msa_hadd_s_w(t3, t3);
+  tmp2 = __msa_hsub_s_w(t3, t3);
+  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);   // +7 ahead of the '>> 4' below
+  SRAI_W2_SW(tmp0, tmp2, 4);
+  FILL_W2_SW(12000, 51000, tmp1, tmp3);   // bias terms of the second pass
+  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+  SRAI_W2_SW(tmp1, tmp3, 16);
+  UNPCK_R_SH_SW(t1, tmp4);
+  tmp5 = __msa_ceqi_w(tmp4, 0);   // mask of lanes equal to zero
+  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);   // NOT: lanes != 0
+  tmp5 = __msa_fill_w(1);
+  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);   // 1 where lane != 0
+  tmp1 += tmp5;   // add 1 to the lanes whose tmp4 input was non-zero
+  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+  out0 = __msa_copy_s_d((v2i64)t0, 0);
+  out1 = __msa_copy_s_d((v2i64)t0, 1);
+  out2 = __msa_copy_s_d((v2i64)t1, 0);
+  out3 = __msa_copy_s_d((v2i64)t1, 1);
+  SD4(out0, out1, out2, out3, out, 8);   // four 64-bit results written to 'out'
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  v8i16 in0 = { 0 };
+  v8i16 in1 = { 0 };
+  v8i16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1;
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };   // shuffle patterns for
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };   // the VSHF_H2_SH calls
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+  in0 = __msa_insert_h(in0, 0, in[  0]);   // gather 16 inputs; every index
+  in0 = __msa_insert_h(in0, 1, in[ 64]);   // read is a multiple of 16
+  in0 = __msa_insert_h(in0, 2, in[128]);
+  in0 = __msa_insert_h(in0, 3, in[192]);
+  in0 = __msa_insert_h(in0, 4, in[ 16]);
+  in0 = __msa_insert_h(in0, 5, in[ 80]);
+  in0 = __msa_insert_h(in0, 6, in[144]);
+  in0 = __msa_insert_h(in0, 7, in[208]);
+  in1 = __msa_insert_h(in1, 0, in[ 48]);
+  in1 = __msa_insert_h(in1, 1, in[112]);
+  in1 = __msa_insert_h(in1, 2, in[176]);
+  in1 = __msa_insert_h(in1, 3, in[240]);
+  in1 = __msa_insert_h(in1, 4, in[ 32]);
+  in1 = __msa_insert_h(in1, 5, in[ 96]);
+  in1 = __msa_insert_h(in1, 6, in[160]);
+  in1 = __msa_insert_h(in1, 7, in[224]);
+  ADDSUB2(in0, in1, tmp0, tmp1);   // four add/sub stages with shuffles between
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+  ADDSUB2(in0, in1, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, out0, out1);
+  SRAI_H2_SH(out0, out1, 1);   // presumably an arithmetic '>> 1' on each lane
+  ST_SH2(out0, out1, out, 8);   // two 8-lane vectors written to 'out'
+}
+
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+  int sum;
+  uint32_t in0_m, in1_m, in2_m, in3_m;
+  v16i8 src0;
+  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
+  v4i32 dst0, dst1;
+  const v16i8 zero = { 0 };
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };   // shuffle patterns for
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };   // the VSHF_H2_SH calls
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+  LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);   // four 32-bit rows of 'in'
+  INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
+  ILVRL_B2_SH(zero, src0, tmp0, tmp1);   // interleave with zero: bytes -> 16 bits
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+  ADDSUB2(in0, in1, tmp0, tmp1);   // four add/sub stages with shuffles between
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+  ADDSUB2(in0, in1, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+  tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);   // add-absolute with 0: |tmp0|
+  tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);   // add-absolute with 0: |tmp1|
+  LD_SH2(w, 8, tmp2, tmp3);   // the weights, as two 8-lane vectors
+  DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);   // weight the magnitudes
+  dst0 = dst0 + dst1;
+  sum = HADD_SW_S32(dst0);   // presumably reduces the lanes to one int
+  return sum;
+}
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int sum1 = TTransform(a, w);   // weighted transform sum for block 'a'
+  const int sum2 = TTransform(b, w);   // same for block 'b'
+  return abs(sum2 - sum1) >> 5;   // absolute difference, scaled down by 32
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;   // accumulated distortion over the sixteen 4x4 sub-blocks
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {   // y is a byte offset: 4 rows/step
+    for (x = 0; x < 16; x += 4) {   // 4 columns per step
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Histogram
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };   // one bin per capped level
+  for (j = start_block; j < end_block; ++j) {   // end_block is exclusive
+    int16_t out[16];
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    {
+      int k;
+      v8i16 coeff0, coeff1;
+      const v8i16 zero = { 0 };
+      const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
+      LD_SH2(&out[0], 8, coeff0, coeff1);   // the 16 coefficients, 2 vectors
+      coeff0 = __msa_add_a_h(coeff0, zero);   // add-absolute with 0: |coeff|
+      coeff1 = __msa_add_a_h(coeff1, zero);
+      SRAI_H2_SH(coeff0, coeff1, 3);   // presumably '>> 3' on each lane
+      coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);   // cap at the threshold
+      coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
+      ST_SH2(coeff0, coeff1, &out[0], 8);   // write bin indices back into out[]
+      for (k = 0; k < 16; ++k) {
+        ++distribution[out[k]];   // out[k] is now a bin index
+      }
+    }
+  }
+  VP8SetHistogramData(distribution, histo);   // defined elsewhere
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]  // element x of row y, rows BPS apart
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)  // rounded (a+2b+c)/4
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)  // rounded (a+b)/2
+
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const uint64_t val_m = LD(top - 1);  // edge bytes starting at top[-1]
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)__msa_fill_b(0), 0, val_m);
+  const v16u8 B = SLDI_UB(A, A, 1);       // A slid by one byte
+  const v16u8 C = SLDI_UB(A, A, 2);       // A slid by two bytes
+  const v16u8 AC = __msa_ave_u_b(A, C);   // (A + C) >> 1
+  const v16u8 B2 = __msa_ave_u_b(B, B);   // == B
+  const v16u8 R = __msa_aver_u_b(AC, B2);  // (AC + B + 1) >> 1: 3-tap filter
+  const uint32_t out = __msa_copy_s_w((v4i32)R, 0);  // the four filtered pixels
+  SW4(out, out, out, out, dst, BPS);      // same 4 pixels go to every row
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+  // Reads top[-1]..top[-5]; each row is one 3-tap average replicated 4 times.
+  const int p0 = top[-1], p1 = top[-2];
+  const int p2 = top[-3], p3 = top[-4];
+  const int p4 = top[-5];
+  const uint32_t splat = 0x01010101U;   // copies a byte value into all 4 bytes
+  WebPUint32ToMem(dst + 0 * BPS, splat * AVG3(p0, p1, p2));
+  WebPUint32ToMem(dst + 1 * BPS, splat * AVG3(p1, p2, p3));
+  WebPUint32ToMem(dst + 2 * BPS, splat * AVG3(p2, p3, p4));
+  WebPUint32ToMem(dst + 3 * BPS, splat * AVG3(p3, p4, p4));   // p4 used twice
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+  uint32_t sum = 4;   // rounding term for the division by 8 below
+  int k = 4;
+  while (k-- > 0) sum += top[k] + top[k - 5];   // top[0..3] and top[-5..-2]
+  sum >>= 3;          // average of the eight samples, at most 255
+  sum *= 0x01010101U;   // replicate that byte into all four bytes
+  SW4(sum, sum, sum, sum, dst, BPS);
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+  const uint64_t val_m = LD(top - 5);  // edge bytes starting at top[-5]
+  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)__msa_fill_b(0), 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);  // lane 8 = top[3]
+  const v16u8 B = SLDI_UB(A, A, 1);       // A slid by one byte
+  const v16u8 C = SLDI_UB(A, A, 2);       // A slid by two bytes
+  const v16u8 AC = __msa_ave_u_b(A, C);   // (A + C) >> 1
+  const v16u8 B2 = __msa_ave_u_b(B, B);   // == B
+  const v16u8 R0 = __msa_aver_u_b(AC, B2);  // (AC + B + 1) >> 1: 3-tap filter
+  const v16u8 R1 = SLDI_UB(R0, R0, 1);    // R1..R3: filtered row slid by 1, 2, 3
+  const v16u8 R2 = SLDI_UB(R1, R1, 1);
+  const v16u8 R3 = SLDI_UB(R2, R2, 1);
+  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);  // low 4 bytes of each
+  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+  SW4(val3, val2, val1, val0, dst, BPS);  // note the reversed order: val3 first
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+  const uint64_t val_m = LD(top);  // edge bytes starting at top[0]
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)__msa_fill_b(0), 0, val_m);
+  const v16u8 B = SLDI_UB(A, A, 1);       // A slid by one byte
+  const v16u8 C1 = SLDI_UB(A, A, 2);      // A slid by two bytes
+  const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);  // lane 6 = top[7]
+  const v16u8 AC = __msa_ave_u_b(A, C);   // (A + C) >> 1
+  const v16u8 B2 = __msa_ave_u_b(B, B);   // == B
+  const v16u8 R0 = __msa_aver_u_b(AC, B2);  // (AC + B + 1) >> 1: 3-tap filter
+  const v16u8 R1 = SLDI_UB(R0, R0, 1);    // R1..R3: filtered row slid by 1, 2, 3
+  const v16u8 R2 = SLDI_UB(R1, R1, 1);
+  const v16u8 R3 = SLDI_UB(R2, R2, 1);
+  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);  // low 4 bytes of each
+  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+  SW4(val0, val1, val2, val3, dst, BPS);  // val0 goes to the first row
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+  // Samples read: top[-4]..top[-1], named K, J, I, X,
+  // and top[0]..top[3], named A..D.
+  const int X = top[-1], I = top[-2];
+  const int J = top[-3], K = top[-4];
+  const int A = top[0], B = top[1];
+  const int C = top[2], D = top[3];
+  // Row 0 holds the 2-tap averages; row 2 is row 0 shifted right by one.
+  DST(0, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(0, 0) = AVG2(X, A);
+  DST(2, 2) = DST(1, 0) = AVG2(A, B);
+  DST(3, 2) = DST(2, 0) = AVG2(B, C);
+  DST(3, 0) = AVG2(C, D);
+  // Row 1 holds the 3-tap averages; row 3 is row 1 shifted right by one.
+  DST(0, 3) = AVG3(K, J, I);
+  DST(1, 3) = DST(0, 1) = AVG3(I, X, A);
+  DST(2, 3) = DST(1, 1) = AVG3(X, A, B);
+  DST(3, 3) = DST(2, 1) = AVG3(A, B, C);
+  DST(3, 1) = AVG3(B, C, D);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+  // Only the eight samples top[0]..top[7], named A..H, are read.
+  const int A = top[0], B = top[1];
+  const int C = top[2], D = top[3];
+  const int E = top[4], F = top[5];
+  const int G = top[6], H = top[7];
+  // 2-tap averages fill row 0; row 2 reuses them shifted left by one,
+  // except for its last pixel.
+  DST(0, 0) = AVG2(A, B);
+  DST(0, 2) = DST(1, 0) = AVG2(B, C);
+  DST(1, 2) = DST(2, 0) = AVG2(C, D);
+  DST(2, 2) = DST(3, 0) = AVG2(D, E);
+  // 3-tap averages fill row 1, row 3 (shifted left by one) and DST(3, 2).
+  DST(0, 1) = AVG3(A, B, C);
+  DST(0, 3) = DST(1, 1) = AVG3(B, C, D);
+  DST(1, 3) = DST(2, 1) = AVG3(C, D, E);
+  DST(2, 3) = DST(3, 1) = AVG3(D, E, F);
+  DST(3, 2) = AVG3(E, F, G);
+  DST(3, 3) = AVG3(F, G, H);
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+  // Only the four samples top[-2]..top[-5], named I..L, are read.
+  const int I = top[-2], J = top[-3];
+  const int K = top[-4], L = top[-5];
+  // Even columns hold 2-tap averages, odd columns 3-tap ones; each row
+  // repeats the previous one shifted left by two pixels, padded with L.
+  DST(0, 0) = AVG2(I, J);
+  DST(1, 0) = AVG3(I, J, K);
+  DST(0, 1) = DST(2, 0) = AVG2(J, K);
+  DST(1, 1) = DST(3, 0) = AVG3(J, K, L);
+  DST(0, 2) = DST(2, 1) = AVG2(K, L);
+  DST(1, 2) = DST(3, 1) = AVG3(K, L, L);
+  DST(2, 2) = DST(3, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+  // Samples read: top[-5]..top[-1] (L, K, J, I, X) and top[0]..top[2] (A..C).
+  const int X = top[-1], I = top[-2];
+  const int J = top[-3], K = top[-4];
+  const int L = top[-5];
+  const int A = top[0], B = top[1], C = top[2];
+  // Column 0 holds the 2-tap averages, repeated in column 2 one row lower.
+  DST(2, 1) = DST(0, 0) = AVG2(I, X);
+  DST(2, 2) = DST(0, 1) = AVG2(J, I);
+  DST(2, 3) = DST(0, 2) = AVG2(K, J);
+  DST(0, 3) = AVG2(L, K);
+  // Row 0, columns 2 and 3, depend only on X and the top samples.
+  DST(2, 0) = AVG3(X, A, B);
+  DST(3, 0) = AVG3(A, B, C);
+  // Column 1 holds 3-tap averages, repeated in column 3 one row lower.
+  DST(3, 1) = DST(1, 0) = AVG3(I, X, A);
+  DST(3, 2) = DST(1, 1) = AVG3(J, I, X);
+  DST(3, 3) = DST(1, 2) = AVG3(K, J, I);
+  DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+  const v16i8 zero = { 0 };
+  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);  // top[-1] in every 16-bit lane
+  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);  // L0..L3: one sample per row
+  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
+  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
+  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
+  const v16u8 T1 = LD_UB(top);
+  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);  // low bytes -> 16 bits
+  const v8i16 d = T - TL;  // signed difference, shared by all four rows
+  v8i16 r0, r1, r2, r3;
+  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);  // presumably rN = d + LN
+  CLIP_SH4_0_255(r0, r1, r2, r3);  // presumably clamps each lane to [0, 255]
+  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);  // pack to bytes, store with stride BPS
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+  DC4(dst + I4DC4, top);   // every predictor writes at its own I4* offset
+  TM4(dst + I4TM4, top);
+  VE4(dst + I4VE4, top);
+  HE4(dst + I4HE4, top);
+  RD4(dst + I4RD4, top);
+  VR4(dst + I4VR4, top);
+  LD4(dst + I4LD4, top);
+  VL4(dst + I4VL4, top);
+  HD4(dst + I4HD4, top);
+  HU4(dst + I4HU4, top);   // ten predictions in total, all from the same 'top'
+}
+
+// luma 16x16 prediction
+
+#define STORE16x16(out, dst) do {                                        \
+    ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS);  \
+    ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
+} while (0)  // writes vector 'out' 16 times, stride BPS (2 x 8 via ST_UB8)
+
+static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+  v16u8 row;   // the one row that is replicated over the whole block
+  if (top == NULL) {
+    row = (v16u8)__msa_fill_b(0x7f);   // no top samples: constant 0x7f
+  } else {
+    row = LD_UB(top);
+  }
+  STORE16x16(row, dst);
+}
+
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
+                                            const uint8_t* left) {
+  int rows_left = 16;   // rows still to be written, four per iteration
+  if (left == NULL) {
+    const v16u8 fallback = (v16u8)__msa_fill_b(0x81);   // no left samples
+    STORE16x16(fallback, dst);
+    return;
+  }
+  for (; rows_left > 0; rows_left -= 4) {
+    const v16u8 r0 = (v16u8)__msa_fill_b(left[0]);   // one left sample per row
+    const v16u8 r1 = (v16u8)__msa_fill_b(left[1]);
+    const v16u8 r2 = (v16u8)__msa_fill_b(left[2]);
+    const v16u8 r3 = (v16u8)__msa_fill_b(left[3]);
+    ST_UB4(r0, r1, r2, r3, dst, BPS);
+    dst += 4 * BPS;
+    left += 4;
+  }
+}
+
+static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top) {
+  if (left != NULL) {
+    if (top != NULL) {  // both edges present: top + left - left[-1], clipped
+      int j;
+      v8i16 d1, d2;
+      const v16i8 zero = { 0 };
+      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);  // left[-1] in every lane
+      const v16u8 T = LD_UB(top);
+      ILVRL_B2_SH(zero, T, d1, d2);  // presumably widens T into two 16-bit halves
+      SUB2(d1, TL, d2, TL, d1, d2);  // presumably d1 -= TL, d2 -= TL
+      for (j = 0; j < 16; j += 4) {  // four output rows per iteration
+        v16i8 t0, t1, t2, t3;
+        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
+        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
+        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
+        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
+        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);  // from d1
+        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);  // from d2
+        CLIP_SH4_0_255(r0, r1, r2, r3);
+        CLIP_SH4_0_255(r4, r5, r6, r7);
+        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);  // to bytes
+        ST_SB4(t0, t1, t2, t3, dst, BPS);
+        dst += 4 * BPS;
+      }
+    } else {  // left only
+      HorizontalPred16x16(dst, left);
+    }
+  } else {
+    if (top != NULL) {  // top only
+      VerticalPred16x16(dst, top);
+    } else {  // neither edge: constant 0x81
+      const v16u8 out = (v16u8)__msa_fill_b(0x81);
+      STORE16x16(out, dst);
+    }
+  }
+}
+
+static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
+                                    const uint8_t* top) {
+  int DC;     // the single value the block is filled with
+  v16u8 out;
+  if (top != NULL && left != NULL) {
+    const v16u8 rtop = LD_UB(top);
+    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);  // adjacent byte pairs summed
+    const v16u8 rleft = LD_UB(left);
+    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+    const v8u16 dctemp = dctop + dcleft;
+    DC = HADD_UH_U32(dctemp);  // presumably the sum of all eight lanes
+    DC = (DC + 16) >> 5;       // rounded division by 32
+  } else if (left != NULL) {   // left but no top
+    const v16u8 rleft = LD_UB(left);
+    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+    DC = HADD_UH_U32(dcleft);
+    DC = (DC + DC + 16) >> 5;  // sum counted twice, same divisor as above
+  } else if (top != NULL) {   // top but no left
+    const v16u8 rtop = LD_UB(top);
+    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+    DC = HADD_UH_U32(dctop);
+    DC = (DC + DC + 16) >> 5;  // sum counted twice, same divisor as above
+  } else {   // no top, no left, nothing.
+    DC = 0x80;
+  }
+  out = (v16u8)__msa_fill_b(DC);  // DC in every byte lane
+  STORE16x16(out, dst);
+}
+
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
+  DCMode16x16(dst + I16DC16, left, top);   // each mode has its own I16* offset
+  VerticalPred16x16(dst + I16VE16, top);
+  HorizontalPred16x16(dst + I16HE16, left);
+  TrueMotion16x16(dst + I16TM16, left, top);
+}
+
+// Chroma 8x8 prediction
+
+#define CALC_DC8(in, out) do {                              \
+  const v8u16 temp0 = __msa_hadd_u_h(in, in);               \
+  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);         \
+  const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1);  \
+  const v2i64 temp3 = __msa_splati_d(temp2, 1);             \
+  const v2i64 temp4 = temp3 + temp2;                        \
+  const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4);       \
+  const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0);      \
+  out = __msa_copy_s_d(temp6, 0);                           \
+} while (0)  // out = 8 copies of the byte (sum of in's 16 bytes + 8) >> 4
+
+#define STORE8x8(out, dst) do {                 \
+  SD4(out, out, out, out, dst + 0 * BPS, BPS);  \
+  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
+} while (0)  // writes the 64-bit 'out' 8 times, stride BPS (2 x 4 via SD4)
+
+static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+  uint64_t row;   // the one 8-byte row replicated over the whole block
+  if (top == NULL) {
+    row = 0x7f7f7f7f7f7f7f7fULL;   // no top samples: constant 0x7f
+  } else {
+    row = LD(top);
+  }
+  STORE8x8(row, dst);
+}
+
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+  if (left != NULL) {
+    int j;
+    for (j = 0; j < 8; j += 4) {  // four rows per iteration
+      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);  // one left sample per row
+      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+      const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);  // low 8 bytes only
+      const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+      const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+      const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+      SD4(out0, out1, out2, out3, dst, BPS);
+      dst += 4 * BPS;
+      left += 4;
+    }
+  } else {  // no left samples: constant 0x81
+    const uint64_t out = 0x8181818181818181ULL;
+    STORE8x8(out, dst);
+  }
+}
+
+static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
+                                      const uint8_t* top) {
+  if (left != NULL) {
+    if (top != NULL) {  // both edges present: top + left - left[-1], clipped
+      int j;
+      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);  // left[-1] in every lane
+      const v16u8 T1 = LD_UB(top);
+      const v16i8 zero = { 0 };
+      const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);  // low bytes -> 16 bits
+      const v8i16 d = T - TL;
+      for (j = 0; j < 8; j += 4) {  // four output rows per iteration
+        uint64_t out0, out1, out2, out3;
+        v16i8 t0, t1;
+        v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
+        v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
+        v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
+        v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
+        ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);  // presumably rN += d
+        CLIP_SH4_0_255(r0, r1, r2, r3);
+        PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);  // two rows packed per vector
+        out0 = __msa_copy_s_d((v2i64)t0, 0);
+        out1 = __msa_copy_s_d((v2i64)t0, 1);
+        out2 = __msa_copy_s_d((v2i64)t1, 0);
+        out3 = __msa_copy_s_d((v2i64)t1, 1);
+        SD4(out0, out1, out2, out3, dst, BPS);
+        dst += 4 * BPS;
+      }
+    } else {  // left only
+      HorizontalPred8x8(dst, left);
+    }
+  } else {
+    if (top != NULL) {  // top only
+      VerticalPred8x8(dst, top);
+    } else {  // neither edge: constant 0x81
+      const uint64_t out = 0x8181818181818181ULL;
+      STORE8x8(out, dst);
+    }
+  }
+}
+
+static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
+  uint64_t out;  // eight copies of the DC byte
+  v16u8 src;     // the 16 bytes fed to CALC_DC8
+  if (top != NULL && left != NULL) {
+    const uint64_t left_m = LD(left);
+    const uint64_t top_m = LD(top);
+    INSERT_D2_UB(left_m, top_m, src);  // presumably src = { left_m, top_m }
+    CALC_DC8(src, out);
+  } else if (left != NULL) {   // left but no top
+    const uint64_t left_m = LD(left);
+    INSERT_D2_UB(left_m, left_m, src);  // left used twice to fill 16 bytes
+    CALC_DC8(src, out);
+  } else if (top != NULL) {   // top but no left
+    const uint64_t top_m = LD(top);
+    INSERT_D2_UB(top_m, top_m, src);  // top used twice to fill 16 bytes
+    CALC_DC8(src, out);
+  } else {   // no top, no left, nothing.
+    src = (v16u8)__msa_fill_b(0x80);
+    out = __msa_copy_s_d((v2i64)src, 0);
+  }
+  STORE8x8(out, dst);
+}
+
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
+  // First pass: the U block, using the pointers exactly as passed in.
+  DCMode8x8(dst + C8DC8, left, top);
+  VerticalPred8x8(dst + C8VE8, top);
+  HorizontalPred8x8(dst + C8HE8, left);
+  TrueMotion8x8(dst + C8TM8, left, top);
+  // Second pass: the V block, 8 bytes further in dst/top, 16 further in left.
+  dst += 8;
+  if (top != NULL) top += 8;      // a missing edge must stay NULL
+  if (left != NULL) left += 16;
+  DCMode8x8(dst + C8DC8, left, top);
+  VerticalPred8x8(dst + C8VE8, top);
+  HorizontalPred8x8(dst + C8HE8, left);
+  TrueMotion8x8(dst + C8TM8, left, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  v16u8 tmp0, tmp1;                                                        \
+  v8i16 tmp2, tmp3;                                                        \
+  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                       \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
+  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
+  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                       \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
+  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
+} while (0)  // out0/1 from (in0, in1), out2/3 from (in2, in3); outputs are set
+
+#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  v16u8 tmp0, tmp1;                                                         \
+  v8i16 tmp2, tmp3;                                                         \
+  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                        \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                      \
+  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
+  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                        \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                      \
+  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
+} while (0)  // like PACK_DOTP_UB4_SW, but presumably adds into out0..out3
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {  // 16 rows of 16 bytes
+  uint32_t sum;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v4i32 out0, out1, out2, out3;  // four partial accumulators
+
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);  // rows 0..7
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);  // initializes
+  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+  a += 8 * BPS;  // move on to rows 8..15
+  b += 8 * BPS;
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+  out0 += out1;  // fold the four accumulators into out0
+  out2 += out3;
+  out0 += out2;
+  sum = HADD_SW_S32(out0);  // presumably the sum of out0's four lanes
+  return sum;
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {  // 8 rows of 16 bytes
+  uint32_t sum;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v4i32 out0, out1, out2, out3;  // four partial accumulators
+
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);  // initializes
+  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+  out0 += out1;  // fold the four accumulators into out0
+  out2 += out3;
+  out0 += out2;
+  sum = HADD_SW_S32(out0);  // presumably the sum of out0's four lanes
+  return sum;
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {  // 8 rows, low 8 bytes used
+  uint32_t sum;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v16u8 t0, t1, t2, t3;  // pairs of half-rows merged into full vectors
+  v4i32 out0, out1, out2, out3;
+
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
+  PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);  // rows 0..3
+  ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
+  PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);  // rows 4..7
+  out0 += out1;  // fold the four accumulators into out0
+  out2 += out3;
+  out0 += out2;
+  sum = HADD_SW_S32(out0);  // presumably the sum of out0's four lanes
+  return sum;
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {  // 4 rows of 4 bytes
+  uint32_t sum = 0;
+  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 src, ref, tmp0, tmp1;
+  v8i16 diff0, diff1;
+  v4i32 out0, out1;
+
+  LW4(a, BPS, src0, src1, src2, src3);  // one 32-bit word per row
+  LW4(b, BPS, ref0, ref1, ref2, ref3);
+  INSERT_W4_UB(src0, src1, src2, src3, src);  // whole block in one vector
+  INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+  ILVRL_B2_UB(src, ref, tmp0, tmp1);  // pair up src and ref bytes
+  HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);  // presumably 16-bit pair differences
+  DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);  // diff * diff products
+  out0 += out1;
+  sum = HADD_SW_S32(out0);  // presumably the sum of out0's four lanes
+  return sum;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         const VP8Matrix* const mtx) {  // rewrites in[], fills out[]
+  int sum;
+  v8i16 in0, in1, sh0, sh1, out0, out1;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
+  v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
+  const v8i16 zero = { 0 };
+  const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };  // shuffle for out[0..7]
+  const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };  // and for out[8..15]
+  const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
+
+  LD_SH2(&in[0], 8, in0, in1);
+  LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
+  tmp4 = __msa_add_a_h(in0, zero);  // |in|
+  tmp5 = __msa_add_a_h(in1, zero);
+  ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);  // pair |in| with sharpen_
+  ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
+  HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);  // presumably pair sums
+  sign0 = (in0 < zero);
+  sign1 = (in1 < zero);                           // all-ones where in < 0
+  LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1);            // iq
+  ILVRL_H2_SW(zero, tmp0, t0, t1);                // iq_ widened with zeros
+  ILVRL_H2_SW(zero, tmp1, t2, t3);
+  LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3);      // bias
+  MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);  // presumably t *= s
+  ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);  // presumably b += t
+  SRAI_W4_SW(b0, b1, b2, b3, 17);                 // presumably b >>= 17
+  PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);        // back to 16-bit lanes
+  tmp0 = (tmp2 > maxlevel);
+  tmp1 = (tmp3 > maxlevel);
+  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);  // clamp
+  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
+  SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);             // presumably negated levels
+  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);  // use them
+  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);  // if in < 0
+  LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
+  t0 = (s0 > t0);                                 // all-ones where s > zthresh
+  t1 = (s1 > t1);
+  t2 = (s2 > t2);
+  t3 = (s3 > t3);
+  PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);        // masks as 16-bit lanes
+  tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);  // else 0
+  tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
+  LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
+  MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);         // presumably in = level * q_
+  VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);  // reorder
+  ST_SH2(in0, in1, &in[0], 8);                    // in[] is overwritten
+  ST_SH2(out0, out1, &out[0], 8);
+  out0 = __msa_add_a_h(out0, out1);               // |out0| + |out1|
+  sum = HADD_SH_S32(out0);
+  return (sum > 0);                               // 1 when the sum is positive
+}
+
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+                           const VP8Matrix* const mtx) {
+  // Block 0's result is OR-ed in unshifted, block 1's shifted left by one.
+  const int nz0 = VP8EncQuantizeBlock(&in[0], &out[0], mtx);
+  const int nz1 = VP8EncQuantizeBlock(&in[16], &out[16], mtx);
+  return nz0 | (nz1 << 1);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
+  VP8FTransform = FTransform;
+  VP8FTransformWHT = FTransformWHT;
+  VP8ITransform = ITransform;
+  // Distortion and histogram helpers.
+  VP8CollectHistogram = CollectHistogram;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  // Intra predictors.
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  // Block metrics, one per block size.
+  VP8SSE4x4 = SSE4x4;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE16x16 = SSE16x16;
+  // Quantizers; the WHT pointer shares the QuantizeBlock implementation.
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantizeBlockWHT = QuantizeBlock;
+  VP8EncQuantize2Blocks = Quantize2Blocks;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c
index 46f6bf9..6a078d6 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -18,7 +18,7 @@
 #include <assert.h>
 
 #include "./neon.h"
-#include "../enc/vp8enci.h"
+#include "../enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -746,9 +746,14 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
   const uint8x16_t a0 = vld1q_u8(a);
   const uint8x16_t b0 = vld1q_u8(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
-  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
-  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
-  *sum = vpadalq_u16(*sum, prod);      // pair-wise add and accumulate
+  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+                                    vget_low_u8(abs_diff));
+  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+                                    vget_high_u8(abs_diff));
+  /* pair-wise adds and widen */
+  const uint32x4_t sum1 = vpaddlq_u16(prod1);
+  const uint32x4_t sum2 = vpaddlq_u16(prod2);
+  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
 }
 
 // Horizontal sum of all four uint32_t values in 'sum'.
@@ -758,7 +763,7 @@ static int SumToInt(uint32x4_t sum) {
   return (int)sum3;
 }
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 16; ++y) {
@@ -767,7 +772,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return SumToInt(sum);
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {
@@ -776,7 +781,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return SumToInt(sum);
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {
@@ -789,13 +794,18 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return SumToInt(sum);
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
   const uint8x16_t a0 = Load4x4(a);
   const uint8x16_t b0 = Load4x4(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
-  uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
-  prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
-  return SumToInt(vpaddlq_u16(prod));
+  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+                                    vget_low_u8(abs_diff));
+  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+                                    vget_high_u8(abs_diff));
+  /* pair-wise adds and widen */
+  const uint32x4_t sum1 = vpaddlq_u16(prod1);
+  const uint32x4_t sum2 = vpaddlq_u16(prod2);
+  return SumToInt(vaddq_u32(sum1, sum2));
 }
 
 //------------------------------------------------------------------------------
@@ -903,10 +913,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
   VP8CollectHistogram = CollectHistogram;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
+
+  VP8SSE16x16 = SSE16x16_NEON;
+  VP8SSE16x8 = SSE16x8_NEON;
+  VP8SSE8x8 = SSE8x8_NEON;
+  VP8SSE4x4 = SSE4x4_NEON;
+
 #if !defined(WORK_AROUND_GCC)
   VP8EncQuantizeBlock = QuantizeBlock;
   VP8EncQuantize2Blocks = Quantize2Blocks;
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index 4a2e3ce..2026a74 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -14,12 +14,13 @@
 #include "./dsp.h"
 
 #if defined(WEBP_USE_SSE2)
+#include <assert.h>
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 
 #include "./common_sse2.h"
-#include "../enc/cost.h"
-#include "../enc/vp8enci.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -139,7 +140,7 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
 
     // Transpose the two 4x4.
     VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
-                        &T2, &T3);
+                           &T2, &T3);
   }
 
   // Add inverse transform to 'ref' and store.
@@ -250,25 +251,11 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
   const __m128i k51000 = _mm_set1_epi32(51000);
 
   // Same operations are done on the (0,3) and (1,2) pairs.
-  // a0 = v0 + v3
-  // a1 = v1 + v2
   // a3 = v0 - v3
   // a2 = v1 - v2
-  const __m128i a01 = _mm_add_epi16(*v01, *v32);
   const __m128i a32 = _mm_sub_epi16(*v01, *v32);
-  const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
   const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
-  const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
 
-  // d0 = (a0 + a1 + 7) >> 4;
-  // d2 = (a0 - a1 + 7) >> 4;
-  const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
-  const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
-  const __m128i d0 = _mm_srai_epi16(c0, 4);
-  const __m128i d2 = _mm_srai_epi16(c2, 4);
-
-  // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
-  // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
   const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
   const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
   const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
@@ -276,14 +263,28 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
   const __m128i d3 = _mm_add_epi32(c3, k51000);
   const __m128i e1 = _mm_srai_epi32(d1, 16);
   const __m128i e3 = _mm_srai_epi32(d3, 16);
+  // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+  // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
   const __m128i f1 = _mm_packs_epi32(e1, e1);
   const __m128i f3 = _mm_packs_epi32(e3, e3);
-  // f1 = f1 + (a3 != 0);
+  // g1 = f1 + (a3 != 0);
   // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
   // desired (0, 1), we add one earlier through k12000_plus_one.
-  // -> f1 = f1 + 1 - (a3 == 0)
+  // -> g1 = f1 + 1 - (a3 == 0)
   const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
 
+  // a0 = v0 + v3
+  // a1 = v1 + v2
+  const __m128i a01 = _mm_add_epi16(*v01, *v32);
+  const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+  const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+  const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+  const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+  // d0 = (a0 + a1 + 7) >> 4;
+  // d2 = (a0 - a1 + 7) >> 4;
+  const __m128i d0 = _mm_srai_epi16(c0, 4);
+  const __m128i d2 = _mm_srai_epi16(c2, 4);
+
   const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
   const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
   _mm_storeu_si128((__m128i*)&out[0], d0_g1);
@@ -1046,6 +1047,37 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 }
 
 //------------------------------------------------------------------------------
+
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {  // dc[k]: a sum, no divide
+  const __m128i mask = _mm_set1_epi16(0x00ff);
+  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);  // 4 rows of 16
+  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
+  const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
+  const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
+  const __m128i b0 = _mm_srli_epi16(a0, 8);     // hi byte
+  const __m128i b1 = _mm_srli_epi16(a1, 8);
+  const __m128i b2 = _mm_srli_epi16(a2, 8);
+  const __m128i b3 = _mm_srli_epi16(a3, 8);
+  const __m128i c0 = _mm_and_si128(a0, mask);   // lo byte
+  const __m128i c1 = _mm_and_si128(a1, mask);
+  const __m128i c2 = _mm_and_si128(a2, mask);
+  const __m128i c3 = _mm_and_si128(a3, mask);
+  const __m128i d0 = _mm_add_epi32(b0, c0);     // hi + lo: two adjacent pixels
+  const __m128i d1 = _mm_add_epi32(b1, c1);     // (sums stay below 2^16, so the
+  const __m128i d2 = _mm_add_epi32(b2, c2);     //  32-bit adds never carry
+  const __m128i d3 = _mm_add_epi32(b3, c3);     //  across 16-bit lanes)
+  const __m128i e0 = _mm_add_epi32(d0, d1);     // rows 0 + 1
+  const __m128i e1 = _mm_add_epi32(d2, d3);     // rows 2 + 3
+  const __m128i f0 = _mm_add_epi32(e0, e1);     // all four rows
+  uint16_t tmp[8];                              // f0 viewed as eight 16-bit sums
+  _mm_storeu_si128((__m128i*)tmp, f0);
+  dc[0] = tmp[0] + tmp[1];                      // columns 0..3, four rows
+  dc[1] = tmp[2] + tmp[3];                      // columns 4..7
+  dc[2] = tmp[4] + tmp[5];                      // columns 8..11
+  dc[3] = tmp[6] + tmp[7];                      // columns 12..15
+}
+
+//------------------------------------------------------------------------------
 // Texture distortion
 //
 // We try to match the spectral content (weighted) between source and
@@ -1331,10 +1363,122 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
   VP8SSE4x4 = SSE4x4;
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
+  VP8Mean16x4 = Mean16x4;
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point (TODO(skal): move to its own file later)
+// Returns sum((src1[i] - src2[i])^2) over [0, len); SIMD path for len >= 16.
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+                                   const uint8_t* src2, int len) {
+  int i = 0;
+  uint32_t sse2 = 0;
+  if (len >= 16) {
+    const int limit = len - 32;  // last i at which two more 16B loads fit
+    int32_t tmp[4];
+    __m128i sum1;
+    __m128i sum = _mm_setzero_si128();
+    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+    i += 16;
+    while (i <= limit) {  // loads for the next step precede each accumulate
+      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      __m128i sum2;
+      i += 16;
+      SubtractAndAccumulate(a0, b0, &sum1);  // presumably partial SSE sums
+      sum = _mm_add_epi32(sum, sum1);
+      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      i += 16;
+      SubtractAndAccumulate(a1, b1, &sum2);
+      sum = _mm_add_epi32(sum, sum2);
+    }
+    SubtractAndAccumulate(a0, b0, &sum1);  // drain the block still pending
+    sum = _mm_add_epi32(sum, sum1);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+
+  for (; i < len; ++i) {  // scalar tail (the whole input when len < 16)
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+// Returns the sum of the eight 16b lanes of *m (the pairwise add wraps at 16b).
+static uint32_t HorizontalAdd16b(const __m128i* const m) {
+  uint16_t tmp[8];
+  const __m128i a = _mm_srli_si128(*m, 8);  // lanes 4..7 moved down to 0..3
+  const __m128i b = _mm_add_epi16(*m, a);
+  _mm_storeu_si128((__m128i*)tmp, b);
+  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+// Returns the sum of the four 32b lanes of *m.
+static uint32_t HorizontalAdd32b(const __m128i* const m) {
+  const __m128i a = _mm_srli_si128(*m, 8);  // lanes 2..3 moved down to 0..1
+  const __m128i b = _mm_add_epi32(*m, a);
+  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));  // total in lane 0
+  return (uint32_t)_mm_cvtsi128_si32(c);
+}
+// Horizontal SSIM weights; entry 7 is 0 so the 8th loaded pixel is ignored.
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+// Adds one row of src1/src2, weighted by Wx * WEIGHT, to the 5 accumulators.
+#define ACCUMULATE_ROW(WEIGHT) do {                         \
+  /* compute row weight (Wx * Wy) */                        \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
+  /* process 8 bytes at a time (7 bytes, actually) */       \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+  /* convert to 16b and multiply by weight */               \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
+  /* accumulate */                                          \
+  xm  = _mm_add_epi16(xm, wa1);                             \
+  ym  = _mm_add_epi16(ym, wb1);                             \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
+  src1 += stride1;                                          \
+  src2 += stride2;                                          \
+} while (0)
+// Gathers weighted 7x7 window statistics and returns VP8SSIMFromStats().
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2) {
+  VP8DistoStats stats;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i xm = zero, ym = zero;                // 16b accumulators
+  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accumulators
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+  assert(2 * VP8_SSIM_KERNEL + 1 == 7);  // the unrolling below is 7 rows
+  ACCUMULATE_ROW(1);  // vertical weights: 1 2 3 4 3 2 1, one call per row
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
+  stats.xm  = HorizontalAdd16b(&xm);
+  stats.ym  = HorizontalAdd16b(&ym);
+  stats.xxm = HorizontalAdd32b(&xxm);
+  stats.xym = HorizontalAdd32b(&xym);
+  stats.yym = HorizontalAdd32b(&yym);
+  return VP8SSIMFromStats(&stats);
+}
+// Points VP8AccumulateSSE and VP8SSIMGet at the SSE2 versions above.
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+  VP8AccumulateSSE = AccumulateSSE_SSE2;
+  VP8SSIMGet = SSIMGet_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
 
 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
 
 #endif  // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
index a178390..e32086d 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
@@ -18,7 +18,7 @@
 #include <stdlib.h>  // for abs()
 
 #include "./common_sse2.h"
-#include "../enc/vp8enci.h"
+#include "../enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
index 9f04faf..65f34aa 100644
--- a/src/3rdparty/libwebp/src/dsp/filters.c
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -227,6 +227,8 @@ WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
 WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
 
 extern void VP8FiltersInitMIPSdspR2(void);
+extern void VP8FiltersInitMSA(void);
+extern void VP8FiltersInitNEON(void);
 extern void VP8FiltersInitSSE2(void);
 
 static volatile VP8CPUInfo filters_last_cpuinfo_used =
@@ -251,11 +253,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
       VP8FiltersInitSSE2();
     }
 #endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8FiltersInitNEON();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       VP8FiltersInitMIPSdspR2();
     }
 #endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8FiltersInitMSA();
+    }
+#endif
   }
   filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/filters_msa.c b/src/3rdparty/libwebp/src/dsp/filters_msa.c
new file mode 100644
index 0000000..4b8922d
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters_msa.c
@@ -0,0 +1,202 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of alpha filters
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./msa_macro.h"
+
+#include <assert.h>
+// dst[i] = src[i] - pred[i] (mod 256) for i in [0, length).
+static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
+                                            const uint8_t* pred,
+                                            uint8_t* dst, int length) {
+  v16u8 src0, pred0, dst0;
+  assert(length >= 0);
+  while (length >= 32) {  // 32 bytes per pass via the msa_macro.h helpers
+    v16u8 src1, pred1, dst1;
+    LD_UB2(src, 16, src0, src1);
+    LD_UB2(pred, 16, pred0, pred1);
+    SUB2(src0, pred0, src1, pred1, dst0, dst1);
+    ST_UB2(dst0, dst1, dst, 16);
+    src += 32;
+    pred += 32;
+    dst += 32;
+    length -= 32;
+  }
+  if (length > 0) {
+    int i;
+    if (length >= 16) {  // one more 16-byte step
+      src0 = LD_UB(src);
+      pred0 = LD_UB(pred);
+      dst0 = src0 - pred0;
+      ST_UB(dst0, dst);
+      src += 16;
+      pred += 16;
+      dst += 16;
+      length -= 16;
+    }
+    for (i = 0; i < length; i++) {  // scalar tail, fewer than 16 bytes
+      dst[i] = src[i] - pred[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+// Argument asserts; reads width, height and stride from the calling scope.
+#define SANITY_CHECK(in, out)  \
+  assert(in != NULL);          \
+  assert(out != NULL);         \
+  assert(width > 0);           \
+  assert(height > 0);          \
+  assert(stride >= width);
+
+//------------------------------------------------------------------------------
+// Horizontal filter
+// Codes each pixel as its difference with the pixel to its left.
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  const uint8_t* preds = data;
+  const uint8_t* in = data;
+  uint8_t* out = filtered_data;
+  int row = 1;
+  SANITY_CHECK(in, out);
+
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
+  // Filter line-by-line; preds aliases in, so preds[x] is the left neighbour.
+  while (row < height) {
+    // Leftmost pixel is predicted from above.
+    PredictLineInverse0(in, preds - stride, out, 1);
+    PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter
+// poutput[w] = pinput[w] - clip8(left + top - top_left), read around ppred[w].
+static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
+                                            const uint8_t* ppred,
+                                            uint8_t* poutput, int stride,
+                                            int size) {
+  int w;
+  const v16i8 zero = { 0 };
+  while (size >= 16) {
+    v16u8 pred0, dst0;
+    v8i16 a0, a1, b0, b1, c0, c1;
+    const v16u8 tmp0 = LD_UB(ppred - 1);           // left
+    const v16u8 tmp1 = LD_UB(ppred - stride);      // top
+    const v16u8 tmp2 = LD_UB(ppred - stride - 1);  // top-left
+    const v16u8 src0 = LD_UB(pinput);
+    ILVRL_B2_SH(zero, tmp0, a0, a1);  // presumably widens u8 to 16b halves
+    ILVRL_B2_SH(zero, tmp1, b0, b1);
+    ILVRL_B2_SH(zero, tmp2, c0, c1);
+    ADD2(a0, b0, a1, b1, a0, a1);
+    SUB2(a0, c0, a1, c1, a0, a1);
+    CLIP_SH2_0_255(a0, a1);
+    pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
+    dst0 = src0 - pred0;
+    ST_UB(dst0, poutput);
+    ppred += 16;
+    pinput += 16;
+    poutput += 16;
+    size -= 16;
+  }
+  for (w = 0; w < size; ++w) {  // scalar tail, fewer than 16 pixels
+    const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
+    poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
+  }
+}
+
+// Row 0 is left-predicted; later rows use the clipped gradient predictor.
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  const uint8_t* in = data;
+  const uint8_t* preds = data;
+  uint8_t* out = filtered_data;
+  int row = 1;
+  SANITY_CHECK(in, out);
+
+  // left prediction for top scan-line
+  out[0] = in[0];
+  PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
+  // Filter line-by-line. preds == in here, so either may serve as the input.
+  while (row < height) {
+    out[0] = in[0] - preds[- stride];  // leftmost: predicted from above
+    PredictLineGradient(preds + 1, in + 1, out + 1, stride, width - 1);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter
+// Row 0 is left-predicted; every later pixel is predicted from the one above.
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  const uint8_t* in = data;
+  const uint8_t* preds = data;
+  uint8_t* out = filtered_data;
+  int row = 1;
+  SANITY_CHECK(in, out);
+
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+  in += stride;  // preds is not advanced: it now lags one row behind 'in'
+  out += stride;
+
+  // Filter line-by-line.
+  while (row < height) {
+    PredictLineInverse0(in, preds, out, width);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+// Installs the MSA filters into the WebPFilters[] dispatch table.
+extern void VP8FiltersInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/filters_neon.c b/src/3rdparty/libwebp/src/dsp/filters_neon.c
new file mode 100644
index 0000000..4d6e50c
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters_neon.c
@@ -0,0 +1,327 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of alpha filters
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+// Helpful macros.
+// Argument asserts; uses width/height/stride/row/num_rows of the caller.
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
+
+// widen eight u8 to s16 (LOAD_U8_TO_S16 loads them from memory first)
+#define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A))
+#define LOAD_U8_TO_S16(A) U8_TO_S16(vld1_u8(A))
+
+// shift left or right by N bytes, inserting zeros ('zero' must be in scope)
+#define SHIFT_RIGHT_N_Q(A, N) vextq_u8((A), zero, (N))
+#define SHIFT_LEFT_N_Q(A, N) vextq_u8(zero, (A), (16 - (N)) % 16)
+
+// rotate left by N bytes (64-bit vector)
+#define ROTATE_LEFT_N(A, N)   vext_u8((A), (A), (N))
+// rotate right by N bytes (64-bit vector)
+#define ROTATE_RIGHT_N(A, N)   vext_u8((A), (A), (8 - (N)) % 8)
+// dst[i] = src[i] - pred[i] (mod 256) for i in [0, length).
+static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
+                             uint8_t* dst, int length) {
+  int i;
+  assert(length >= 0);
+  for (i = 0; i + 16 <= length; i += 16) {  // 16 bytes per iteration
+    const uint8x16_t A = vld1q_u8(&src[i]);
+    const uint8x16_t B = vld1q_u8(&pred[i]);
+    const uint8x16_t C = vsubq_u8(A, B);
+    vst1q_u8(&dst[i], C);
+  }
+  for (; i < length; ++i) dst[i] = src[i] - pred[i];  // scalar tail
+}
+
+// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
+static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
+  PredictLine_NEON(src, src - 1, dst, length);  // reads src[-1] as first pred
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+// Left-predicts rows [row, row + num_rows) of 'in' into 'out'.
+static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
+                                                int width, int height,
+                                                int stride,
+                                                int row, int num_rows,
+                                                uint8_t* out) {
+  const size_t start_offset = row * stride;  // offset of first requested row
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+    row = 1;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    // Leftmost pixel is predicted from above.
+    out[0] = in[0] - in[-stride];
+    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+    ++row;
+    in += stride;
+    out += stride;
+  }
+}
+// Whole-image entry point: horizontally filters rows [0, height).
+static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
+                                  int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
+                          filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+// Predicts rows [row, row + num_rows) from the row above (row 0: from left).
+static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
+  const size_t start_offset = row * stride;  // offset of first requested row
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+    row = 1;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    PredictLine_NEON(in, in - stride, out, width);  // pred = row above
+    ++row;
+    in += stride;
+    out += stride;
+  }
+}
+// Whole-image entry point: vertically filters rows [0, height).
+static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_NEON(data, width, height, stride, 0, height,
+                        filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+// Returns a + b - c clamped to [0, 255].
+static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
+  const int g = a + b - c;
+  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
+}
+// out[i] = row[i] - clip8(row[i-1] + top[i] - top[i-1]) for i < length.
+static void GradientPredictDirect_NEON(const uint8_t* const row,
+                                       const uint8_t* const top,
+                                       uint8_t* const out, int length) {
+  int i;
+  for (i = 0; i + 8 <= length; i += 8) {
+    const uint8x8_t A = vld1_u8(&row[i - 1]);         // left
+    const uint8x8_t B = vld1_u8(&top[i + 0]);         // top
+    const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B));
+    const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]);  // top-left
+    const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D));  // clip to [0, 255]
+    const uint8x8_t F = vld1_u8(&row[i + 0]);
+    vst1_u8(&out[i], vsub_u8(F, E));
+  }
+  for (; i < length; ++i) {  // scalar tail, fewer than 8 pixels
+    out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
+  }
+}
+// Gradient-filters rows [row, row + num_rows); row 0 is left-predicted.
+static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
+                                              int width, int height,
+                                              int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
+  const size_t start_offset = row * stride;  // offset of first requested row
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+
+  // left prediction for top scan-line
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+    row = 1;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    out[0] = in[0] - in[-stride];  // leftmost: predicted from above
+    GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
+    ++row;
+    in += stride;
+    out += stride;
+  }
+}
+// Whole-image entry point: gradient-filters rows [0, height).
+static void GradientFilter_NEON(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  DoGradientFilter_NEON(data, width, height, stride, 0, height,
+                        filtered_data);
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Inverse transforms
+// out[i] = in[i] + out[i - 1] (mod 256); out[0] is seeded from prev[0] or 0.
+static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
+                                    uint8_t* out, int width) {
+  int i;
+  const uint8x16_t zero = vdupq_n_u8(0);
+  uint8x16_t last;
+  out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+  if (width <= 1) return;
+  last = vsetq_lane_u8(out[0], zero, 0);  // carry-in: previous output pixel
+  for (i = 1; i + 16 <= width; i += 16) {
+    const uint8x16_t A0 = vld1q_u8(&in[i]);
+    const uint8x16_t A1 = vaddq_u8(A0, last);     // carry-in lands in lane 0
+    const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1);  // prefix sum in 4 steps:
+    const uint8x16_t A3 = vaddq_u8(A1, A2);       // shifts of 1, 2, 4, 8 bytes
+    const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2);
+    const uint8x16_t A5 = vaddq_u8(A3, A4);
+    const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4);
+    const uint8x16_t A7 = vaddq_u8(A5, A6);
+    const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8);
+    const uint8x16_t A9 = vaddq_u8(A7, A8);
+    vst1q_u8(&out[i], A9);
+    last = SHIFT_RIGHT_N_Q(A9, 15);  // lane 15 -> lane 0 for the next block
+  }
+  for (; i < width; ++i) out[i] = in[i] + out[i - 1];  // scalar tail
+}
+// out[i] = in[i] + prev[i] (mod 256); without a prev row, unfilters from left.
+static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
+  if (prev == NULL) {
+    HorizontalUnfilter_NEON(NULL, in, out, width);
+  } else {
+    int i;
+    assert(width >= 0);
+    for (i = 0; i + 16 <= width; i += 16) {  // 16 bytes per iteration
+      const uint8x16_t A = vld1q_u8(&in[i]);
+      const uint8x16_t B = vld1q_u8(&prev[i]);
+      const uint8x16_t C = vaddq_u8(A, B);
+      vst1q_u8(&out[i], C);
+    }
+    for (; i < width; ++i) out[i] = in[i] + prev[i];  // scalar tail
+  }
+}
+
+// GradientUnfilter_NEON is correct but slower than the C-version,
+// at least on ARM64. For armv7, it's a wash.
+// So best is to disable it for now, but keep the idea around...
+// #define USE_GRADIENT_UNFILTER
+
+#if defined(USE_GRADIENT_UNFILTER)
+#define GRAD_PROCESS_LANE(L)  do {                                             \
+  const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
+  const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
+  const uint8x8_t delta = vqmovun_s16(tmp2);                                   \
+  pred = vadd_u8(D, delta);                                                    \
+  out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1);                             \
+} while (0)
+// row[i] = in[i] + clip8(row[i-1] + top[i] - top[i-1]) for i < length.
+static void GradientPredictInverse_NEON(const uint8_t* const in,
+                                        const uint8_t* const top,
+                                        uint8_t* const row, int length) {
+  if (length > 0) {
+    int i;
+    uint8x8_t pred = vdup_n_u8(row[-1]);   // left sample
+    uint8x8_t out = vdup_n_u8(0);
+    for (i = 0; i + 8 <= length; i += 8) {
+      const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]);
+      const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]);
+      const int16x8_t BC = vsubq_s16(B, C);  // unclipped gradient basis B - C
+      const uint8x8_t D = vld1_u8(&in[i]);   // base input
+      GRAD_PROCESS_LANE(0);  // lane L needs lane L-1, hence 8 serial steps
+      GRAD_PROCESS_LANE(1);
+      GRAD_PROCESS_LANE(2);
+      GRAD_PROCESS_LANE(3);
+      GRAD_PROCESS_LANE(4);
+      GRAD_PROCESS_LANE(5);
+      GRAD_PROCESS_LANE(6);
+      GRAD_PROCESS_LANE(7);
+      vst1_u8(&row[i], out);
+    }
+    for (; i < length; ++i) {  // scalar tail, fewer than 8 pixels
+      row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
+    }
+  }
+}
+#undef GRAD_PROCESS_LANE
+// Inverse gradient filter for one row; without prev, unfilters from the left.
+static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
+                                 uint8_t* out, int width) {
+  if (prev == NULL) {
+    HorizontalUnfilter_NEON(NULL, in, out, width);
+  } else {
+    out[0] = in[0] + prev[0];  // predict from above
+    GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1);
+  }
+}
+
+#endif   // USE_GRADIENT_UNFILTER
+
+//------------------------------------------------------------------------------
+// Entry point
+// Installs the NEON versions into WebPUnfilters[] and WebPFilters[].
+extern void VP8FiltersInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
+#if defined(USE_GRADIENT_UNFILTER)
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
+#endif
+
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index af913ef..20d18f6 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -17,20 +17,16 @@
 
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li.h"
-#include "../utils/endian_inl.h"
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
 #include "./lossless.h"
+#include "./lossless_common.h"
 
 #define MAX_DIFF_COST (1e30f)
 
 //------------------------------------------------------------------------------
 // Image transforms.
 
-// In-place sum of each component with mod 256.
-static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
-  *a = VP8LAddPixels(*a, b);
-}
-
 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
   return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
 }
@@ -171,21 +167,41 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
   return pred;
 }
 
+GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
+static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
+                          int num_pixels, uint32_t* out) {
+  int i;
+  uint32_t left = out[-1];  // the pixel just before 'out' seeds the chain
+  for (i = 0; i < num_pixels; ++i) {
+    out[i] = left = VP8LAddPixels(in[i], left);  // each output feeds the next
+  }
+  (void)upper;  // unused: only the left neighbour is read here
+}
+GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
+GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
+GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
+GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
+GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
+GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
+GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
+GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
+GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
+GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
+GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
+GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
+
 //------------------------------------------------------------------------------
 
 // Inverse prediction.
 static void PredictorInverseTransform(const VP8LTransform* const transform,
-                                      int y_start, int y_end, uint32_t* data) {
+                                      int y_start, int y_end,
+                                      const uint32_t* in, uint32_t* out) {
   const int width = transform->xsize_;
   if (y_start == 0) {  // First Row follows the L (mode=1) mode.
-    int x;
-    const uint32_t pred0 = Predictor0(data[-1], NULL);
-    AddPixelsEq(data, pred0);
-    for (x = 1; x < width; ++x) {
-      const uint32_t pred1 = Predictor1(data[x - 1], NULL);
-      AddPixelsEq(data + x, pred1);
-    }
-    data += width;
+    PredictorAdd0(in, NULL, 1, out);
+    PredictorAdd1(in + 1, NULL, width - 1, out + 1);
+    in += width;
+    out += width;
     ++y_start;
   }
 
@@ -193,36 +209,26 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
     int y = y_start;
     const int tile_width = 1 << transform->bits_;
     const int mask = tile_width - 1;
-    const int safe_width = width & ~mask;
     const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
     const uint32_t* pred_mode_base =
         transform->data_ + (y >> transform->bits_) * tiles_per_row;
 
     while (y < y_end) {
-      const uint32_t pred2 = Predictor2(data[-1], data - width);
       const uint32_t* pred_mode_src = pred_mode_base;
-      VP8LPredictorFunc pred_func;
       int x = 1;
-      int t = 1;
       // First pixel follows the T (mode=2) mode.
-      AddPixelsEq(data, pred2);
+      PredictorAdd2(in, out - width, 1, out);
       // .. the rest:
-      while (x < safe_width) {
-        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
-        for (; t < tile_width; ++t, ++x) {
-          const uint32_t pred = pred_func(data[x - 1], data + x - width);
-          AddPixelsEq(data + x, pred);
-        }
-        t = 0;
-      }
-      if (x < width) {
-        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
-        for (; x < width; ++x) {
-          const uint32_t pred = pred_func(data[x - 1], data + x - width);
-          AddPixelsEq(data + x, pred);
-        }
+      while (x < width) {
+        const VP8LPredictorAddSubFunc pred_func =
+            VP8LPredictorsAdd[((*pred_mode_src++) >> 8) & 0xf];
+        int x_end = (x & ~mask) + tile_width;
+        if (x_end > width) x_end = width;
+        pred_func(in + x, out + x - width, x_end - x, out + x);
+        x = x_end;
       }
-      data += width;
+      in += width;
+      out += width;
       ++y;
       if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
         pred_mode_base += tiles_per_row;
@@ -233,21 +239,22 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
+void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
+                                uint32_t* dst) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint32_t argb = data[i];
+    const uint32_t argb = src[i];
     const uint32_t green = ((argb >> 8) & 0xff);
     uint32_t red_blue = (argb & 0x00ff00ffu);
     red_blue += (green << 16) | green;
     red_blue &= 0x00ff00ffu;
-    data[i] = (argb & 0xff00ff00u) | red_blue;
+    dst[i] = (argb & 0xff00ff00u) | red_blue;
   }
 }
 
-static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
-                                                int8_t color) {
-  return (uint32_t)((int)(color_pred) * color) >> 5;
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred,
+                                           int8_t color) {
+  return ((int)color_pred * color) >> 5;
 }
 
 static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
@@ -257,27 +264,29 @@ static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
   m->red_to_blue_   = (color_code >> 16) & 0xff;
 }
 
-void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
-                                 int num_pixels) {
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+                                 const uint32_t* src, int num_pixels,
+                                 uint32_t* dst) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint32_t argb = data[i];
+    const uint32_t argb = src[i];
     const uint32_t green = argb >> 8;
     const uint32_t red = argb >> 16;
-    uint32_t new_red = red;
-    uint32_t new_blue = argb;
+    int new_red = red;
+    int new_blue = argb;
     new_red += ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue += ColorTransformDelta(m->green_to_blue_, green);
     new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
     new_blue &= 0xff;
-    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+    dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
 }
 
 // Color space inverse transform.
 static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
-                                       int y_start, int y_end, uint32_t* data) {
+                                       int y_start, int y_end,
+                                       const uint32_t* src, uint32_t* dst) {
   const int width = transform->xsize_;
   const int tile_width = 1 << transform->bits_;
   const int mask = tile_width - 1;
@@ -291,17 +300,19 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
   while (y < y_end) {
     const uint32_t* pred = pred_row;
     VP8LMultipliers m = { 0, 0, 0 };
-    const uint32_t* const data_safe_end = data + safe_width;
-    const uint32_t* const data_end = data + width;
-    while (data < data_safe_end) {
+    const uint32_t* const src_safe_end = src + safe_width;
+    const uint32_t* const src_end = src + width;
+    while (src < src_safe_end) {
       ColorCodeToMultipliers(*pred++, &m);
-      VP8LTransformColorInverse(&m, data, tile_width);
-      data += tile_width;
+      VP8LTransformColorInverse(&m, src, tile_width, dst);
+      src += tile_width;
+      dst += tile_width;
     }
-    if (data < data_end) {  // Left-overs using C-version.
+    if (src < src_end) {  // Left-overs using C-version.
       ColorCodeToMultipliers(*pred++, &m);
-      VP8LTransformColorInverse(&m, data, remaining_width);
-      data += remaining_width;
+      VP8LTransformColorInverse(&m, src, remaining_width, dst);
+      src += remaining_width;
+      dst += remaining_width;
     }
     ++y;
     if ((y & mask) == 0) pred_row += tiles_per_row;
@@ -366,10 +377,10 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
   assert(row_end <= transform->ysize_);
   switch (transform->type_) {
     case SUBTRACT_GREEN:
-      VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
+      VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
       break;
     case PREDICTOR_TRANSFORM:
-      PredictorInverseTransform(transform, row_start, row_end, out);
+      PredictorInverseTransform(transform, row_start, row_end, in, out);
       if (row_end != transform->ysize_) {
         // The last predicted row in this iteration will be the top-pred row
         // for the first row in next iteration.
@@ -378,7 +389,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
       }
       break;
     case CROSS_COLOR_TRANSFORM:
-      ColorSpaceInverseTransform(transform, row_start, row_end, out);
+      ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
       break;
     case COLOR_INDEXING_TRANSFORM:
       if (in == out && transform->bits_ > 0) {
@@ -555,10 +566,15 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
 
 //------------------------------------------------------------------------------
 
-VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
 VP8LPredictorFunc VP8LPredictors[16];
 
-VP8LTransformColorFunc VP8LTransformColorInverse;
+// exposed plain-C implementations
+VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
+VP8LPredictorFunc VP8LPredictors_C[16];
+
+VP8LTransformColorInverseFunc VP8LTransformColorInverse;
 
 VP8LConvertFunc VP8LConvertBGRAToRGB;
 VP8LConvertFunc VP8LConvertBGRAToRGBA;
@@ -572,29 +588,37 @@ VP8LMapAlphaFunc VP8LMapColor8b;
 extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
+extern void VP8LDspInitMSA(void);
 
 static volatile VP8CPUInfo lossless_last_cpuinfo_used =
     (VP8CPUInfo)&lossless_last_cpuinfo_used;
 
+#define COPY_PREDICTOR_ARRAY(IN, OUT) do {              \
+  (OUT)[0] = IN##0;                                     \
+  (OUT)[1] = IN##1;                                     \
+  (OUT)[2] = IN##2;                                     \
+  (OUT)[3] = IN##3;                                     \
+  (OUT)[4] = IN##4;                                     \
+  (OUT)[5] = IN##5;                                     \
+  (OUT)[6] = IN##6;                                     \
+  (OUT)[7] = IN##7;                                     \
+  (OUT)[8] = IN##8;                                     \
+  (OUT)[9] = IN##9;                                     \
+  (OUT)[10] = IN##10;                                   \
+  (OUT)[11] = IN##11;                                   \
+  (OUT)[12] = IN##12;                                   \
+  (OUT)[13] = IN##13;                                   \
+  (OUT)[14] = IN##0; /* <- padding security sentinels*/ \
+  (OUT)[15] = IN##0;                                    \
+} while (0);
+
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
   if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  VP8LPredictors[0] = Predictor0;
-  VP8LPredictors[1] = Predictor1;
-  VP8LPredictors[2] = Predictor2;
-  VP8LPredictors[3] = Predictor3;
-  VP8LPredictors[4] = Predictor4;
-  VP8LPredictors[5] = Predictor5;
-  VP8LPredictors[6] = Predictor6;
-  VP8LPredictors[7] = Predictor7;
-  VP8LPredictors[8] = Predictor8;
-  VP8LPredictors[9] = Predictor9;
-  VP8LPredictors[10] = Predictor10;
-  VP8LPredictors[11] = Predictor11;
-  VP8LPredictors[12] = Predictor12;
-  VP8LPredictors[13] = Predictor13;
-  VP8LPredictors[14] = Predictor0;     // <- padding security sentinels
-  VP8LPredictors[15] = Predictor0;
+  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
+  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
+  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
 
   VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
 
@@ -626,8 +650,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
       VP8LDspInitMIPSdspR2();
     }
 #endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8LDspInitMSA();
+    }
+#endif
   }
   lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
+#undef COPY_PREDICTOR_ARRAY
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index 9f0d7a2..352a54e 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -18,7 +18,7 @@
 #include "../webp/types.h"
 #include "../webp/decode.h"
 
-#include "../enc/histogram.h"
+#include "../enc/histogram_enc.h"
 #include "../utils/utils.h"
 
 #ifdef __cplusplus
@@ -26,7 +26,7 @@ extern "C" {
 #endif
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../enc/delta_palettization.h"
+#include "../enc/delta_palettization_enc.h"
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
 //------------------------------------------------------------------------------
@@ -34,9 +34,17 @@ extern "C" {
 
 typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
-
-typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
-extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+extern VP8LPredictorFunc VP8LPredictors_C[16];
+// These Add/Sub functions expect upper[-1] and out[-1] to be readable.
+typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
+                                        const uint32_t* upper, int num_pixels,
+                                        uint32_t* out);
+extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
+extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
+
+typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
+                                             int num_pixels, uint32_t* dst);
+extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 
 typedef struct {
   // Note: the members are uint8_t, so that any negative values are
@@ -45,9 +53,10 @@ typedef struct {
   uint8_t green_to_blue_;
   uint8_t red_to_blue_;
 } VP8LMultipliers;
-typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
-                                       uint32_t* argb_data, int num_pixels);
-extern VP8LTransformColorFunc VP8LTransformColorInverse;
+typedef void (*VP8LTransformColorInverseFunc)(const VP8LMultipliers* const m,
+                                              const uint32_t* src,
+                                              int num_pixels, uint32_t* dst);
+extern VP8LTransformColorInverseFunc VP8LTransformColorInverse;
 
 struct VP8LTransform;  // Defined in dec/vp8li.h.
 
@@ -72,23 +81,6 @@ extern VP8LConvertFunc VP8LConvertBGRAToBGR;
 void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
                          WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
 
-// color mapping related functions.
-static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
-  return (idx >> 8) & 0xff;
-}
-
-static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
-  return idx;
-}
-
-static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
-  return val;
-}
-
-static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
-  return (val >> 8) & 0xff;
-}
-
 typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
                                 const uint32_t* const color_map,
                                 uint32_t* dst, int y_start,
@@ -110,7 +102,8 @@ void VP8LColorIndexInverseTransformAlpha(
 
 // Expose some C-only fallback functions
 void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
-                                 uint32_t* data, int num_pixels);
+                                 const uint32_t* src, int num_pixels,
+                                 uint32_t* dst);
 
 void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
@@ -119,7 +112,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
 void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
                                int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
+                                uint32_t* dst);
 
 // Must be called before calling any of the above methods.
 void VP8LDspInit(void);
@@ -127,7 +121,10 @@ void VP8LDspInit(void);
 //------------------------------------------------------------------------------
 // Encoding
 
-extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
+extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+                                       uint32_t* const dst, int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
     const uint32_t* argb, int stride,
@@ -153,62 +150,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
                                       int green_to_blue, int red_to_blue,
                                       int histo[]);
 
-//------------------------------------------------------------------------------
-// Image transforms.
-
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
-                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image, int near_lossless, int exact,
-                       int used_subtract_green);
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image);
-
-//------------------------------------------------------------------------------
-// Misc methods.
-
-// Computes sampled size of 'size' when sampling using 'sampling bits'.
-static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
-                                              uint32_t sampling_bits) {
-  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
-}
-
-// Converts near lossless quality into max number of bits shaved off.
-static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
-  //    100 -> 0
-  // 80..99 -> 1
-  // 60..79 -> 2
-  // 40..59 -> 3
-  // 20..39 -> 4
-  //  0..19 -> 5
-  return 5 - near_lossless_quality / 20;
-}
-
-// -----------------------------------------------------------------------------
-// Faster logarithm for integers. Small values use a look-up table.
-
-// The threshold till approximate version of log_2 can be used.
-// Practically, we can get rid of the call to log() as the two values match to
-// very high degree (the ratio of these two is 0.99999x).
-// Keeping a high threshold for now.
-#define APPROX_LOG_WITH_CORRECTION_MAX  65536
-#define APPROX_LOG_MAX                   4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-#define LOG_LOOKUP_IDX_MAX 256
-extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
-extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
-
-extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
-extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
-
-static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
-  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
-}
-// Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
-  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
-}
+extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 
 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.
@@ -228,11 +171,6 @@ typedef struct {        // small struct to hold counters
   int streaks[2][2];    // [zero/non-zero][streak<3 / streak>=3]
 } VP8LStreaks;
 
-typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
-                                                 const uint32_t* Y, int length);
-
-extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
-
 typedef struct {            // small struct to hold bit entropy results
   double entropy;           // entropy
   uint32_t sum;             // sum of the population
@@ -246,26 +184,20 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
 // Get the combined symbol bit entropy and Huffman cost stats for the
 // distributions 'X' and 'Y'. Those results can then be refined according to
 // codec specific heuristics.
-void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
-                                     const uint32_t* const Y, int length,
-                                     VP8LBitEntropy* const bit_entropy,
-                                     VP8LStreaks* const stats);
+typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
+    const uint32_t X[], const uint32_t Y[], int length,
+    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
+extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
+
 // Get the entropy for the distribution 'X'.
-void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
-                             VP8LBitEntropy* const bit_entropy,
-                             VP8LStreaks* const stats);
+typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
+                                            VP8LBitEntropy* const bit_entropy,
+                                            VP8LStreaks* const stats);
+extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
 
 void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
                               VP8LBitEntropy* const entropy);
 
-typedef void (*GetEntropyUnrefinedHelperFunc)(uint32_t val, int i,
-                                              uint32_t* const val_prev,
-                                              int* const i_prev,
-                                              VP8LBitEntropy* const bit_entropy,
-                                              VP8LStreaks* const stats);
-// Internal function used by VP8LGet*EntropyUnrefined.
-extern GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
-
 typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
                                      const VP8LHistogram* const b,
                                      VP8LHistogram* const out);
@@ -279,86 +211,11 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
 // Returns the first index where array1 and array2 are different.
 extern VP8LVectorMismatchFunc VP8LVectorMismatch;
 
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1)))  // zero or a power of two.
-    return log_floor;
-  else
-    return log_floor + 1;
-}
-
-// Splitting of distance and length codes into prefixes and
-// extra bits. The prefixes are encoded with an entropy code
-// while the extra bits are stored just as normal bits.
-static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
-                                                  int* const extra_bits) {
-  const int highest_bit = BitsLog2Floor(--distance);
-  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-  *extra_bits = highest_bit - 1;
-  *code = 2 * highest_bit + second_highest_bit;
-}
-
-static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
-                                              int* const extra_bits,
-                                              int* const extra_bits_value) {
-  const int highest_bit = BitsLog2Floor(--distance);
-  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-  *extra_bits = highest_bit - 1;
-  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
-  *code = 2 * highest_bit + second_highest_bit;
-}
-
-#define PREFIX_LOOKUP_IDX_MAX   512
-typedef struct {
-  int8_t code_;
-  int8_t extra_bits_;
-} VP8LPrefixCode;
-
-// These tables are derived using VP8LPrefixEncodeNoLUT.
-extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
-extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
-static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
-                                             int* const extra_bits) {
-  if (distance < PREFIX_LOOKUP_IDX_MAX) {
-    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
-    *code = prefix_code.code_;
-    *extra_bits = prefix_code.extra_bits_;
-  } else {
-    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
-  }
-}
-
-static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
-                                         int* const extra_bits,
-                                         int* const extra_bits_value) {
-  if (distance < PREFIX_LOOKUP_IDX_MAX) {
-    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
-    *code = prefix_code.code_;
-    *extra_bits = prefix_code.extra_bits_;
-    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
-  } else {
-    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
-  }
-}
-
-// Sum of each component, mod 256.
-static WEBP_INLINE uint32_t VP8LAddPixels(uint32_t a, uint32_t b) {
-  const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
-  const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
-  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
-}
-
-// Difference of each component, mod 256.
-static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
-  const uint32_t alpha_and_green =
-      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
-  const uint32_t red_and_blue =
-      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
-  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
-}
-
-void VP8LBundleColorMap(const uint8_t* const row, int width,
-                        int xbits, uint32_t* const dst);
+typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
+                                       int xbits, uint32_t* dst);
+extern VP8LBundleColorMapFunc VP8LBundleColorMap;
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+                          uint32_t* dst);
 
 // Must be called before calling any of the above methods.
 void VP8LEncDspInit(void);
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_common.h b/src/3rdparty/libwebp/src/dsp/lossless_common.h
new file mode 100644
index 0000000..c40f711
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_common.h
@@ -0,0 +1,210 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
+#define WEBP_DSP_LOSSLESS_COMMON_H_
+
+#include "../webp/types.h"
+
+#include "../utils/utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Decoding
+
+// color mapping related functions.
+static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
+  return (idx >> 8) & 0xff;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
+  return idx;
+}
+
+static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
+  return val;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
+  return (val >> 8) & 0xff;
+}
+
+//------------------------------------------------------------------------------
+// Misc methods.
+
+// Computes sampled size of 'size' when sampling using 'sampling bits'.
+static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
+                                              uint32_t sampling_bits) {
+  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
+}
+
+// Converts near lossless quality into max number of bits shaved off.
+static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
+  //    100 -> 0
+  // 80..99 -> 1
+  // 60..79 -> 2
+  // 40..59 -> 3
+  // 20..39 -> 4
+  //  0..19 -> 5
+  return 5 - near_lossless_quality / 20;
+}
+
+// -----------------------------------------------------------------------------
+// Faster logarithm for integers. Small values use a look-up table.
+
+// The threshold till approximate version of log_2 can be used.
+// Practically, we can get rid of the call to log() as the two values match to
+// very high degree (the ratio of these two is 0.99999x).
+// Keeping a high threshold for now.
+#define APPROX_LOG_WITH_CORRECTION_MAX  65536
+#define APPROX_LOG_MAX                   4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1))) {  // zero or a power of two.
+    return log_floor;
+  }
+  return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+                                                  int* const extra_bits) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+                                              int* const extra_bits,
+                                              int* const extra_bits_value) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+#define PREFIX_LOOKUP_IDX_MAX   512
+typedef struct {
+  int8_t code_;
+  int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+                                             int* const extra_bits) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+  } else {
+    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+  }
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+                                         int* const extra_bits,
+                                         int* const extra_bits_value) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+  } else {
+    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+  }
+}
+
+// Sum of each component, mod 256.
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t VP8LAddPixels(uint32_t a, uint32_t b) {
+  const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
+  const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
+  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+// Difference of each component, mod 256.
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
+  const uint32_t alpha_and_green =
+      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
+  const uint32_t red_and_blue =
+      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
+  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+//------------------------------------------------------------------------------
+// Transform-related functions used in both encoding and decoding.
+
+// Macros used to create a batch predictor that iteratively uses a
+// one-pixel predictor.
+
+// The prediction is added to the input pixel (which is therefore considered
+// as a residual) and the reconstructed pixel is stored in the output.
+#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD)             \
+static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
+                          int num_pixels, uint32_t* out) {           \
+  int x;                                                             \
+  for (x = 0; x < num_pixels; ++x) {                                 \
+    const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x);        \
+    out[x] = VP8LAddPixels(in[x], pred);                             \
+  }                                                                  \
+}
+
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB)             \
+static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
+                          int num_pixels, uint32_t* out) {           \
+  int x;                                                             \
+  for (x = 0; x < num_pixels; ++x) {                                 \
+    const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x);         \
+    out[x] = VP8LSubPixels(in[x], pred);                             \
+  }                                                                  \
+}
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_LOSSLESS_COMMON_H_
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
index 256f6f5..4e46fba 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -17,16 +17,12 @@
 
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li.h"
-#include "../utils/endian_inl.h"
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
 #include "./lossless.h"
+#include "./lossless_common.h"
 #include "./yuv.h"
 
-#define MAX_DIFF_COST (1e30f)
-
-static const int kPredLowEffort = 11;
-static const uint32_t kMaskAlpha = 0xff000000;
-
 // lookup table for small values of log2(int)
 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
   0.0000000000000000f, 0.0000000000000000f,
@@ -380,26 +376,9 @@ static float FastLog2Slow(uint32_t v) {
   }
 }
 
-// Mostly used to reduce code size + readability
-static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
-static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
-
 //------------------------------------------------------------------------------
 // Methods to calculate Entropy (Shannon).
 
-static float PredictionCostSpatial(const int counts[256], int weight_0,
-                                   double exp_val) {
-  const int significant_symbols = 256 >> 4;
-  const double exp_decay_factor = 0.6;
-  double bits = weight_0 * counts[0];
-  int i;
-  for (i = 1; i < significant_symbols; ++i) {
-    bits += exp_val * (counts[i] + counts[256 - i]);
-    exp_val *= exp_decay_factor;
-  }
-  return (float)(-0.1 * bits);
-}
-
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
   int i;
@@ -422,18 +401,6 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
   return (float)retval;
 }
 
-static float PredictionCostSpatialHistogram(const int accumulated[4][256],
-                                            const int tile[4][256]) {
-  int i;
-  double retval = 0;
-  for (i = 0; i < 4; ++i) {
-    const double kExpValue = 0.94;
-    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
-    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
-  }
-  return (float)retval;
-}
-
 void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
   entropy->entropy = 0.;
   entropy->sum = 0;
@@ -486,9 +453,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
 }
 
-void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
-                             VP8LBitEntropy* const bit_entropy,
-                             VP8LStreaks* const stats) {
+static void GetEntropyUnrefined(const uint32_t X[], int length,
+                                VP8LBitEntropy* const bit_entropy,
+                                VP8LStreaks* const stats) {
   int i;
   int i_prev = 0;
   uint32_t x_prev = X[0];
@@ -499,18 +466,18 @@ void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
   for (i = 1; i < length; ++i) {
     const uint32_t x = X[i];
     if (x != x_prev) {
-      VP8LGetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
     }
   }
-  VP8LGetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
 
   bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
 }
 
-void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
-                                     const uint32_t* const Y, int length,
-                                     VP8LBitEntropy* const bit_entropy,
-                                     VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
+                                        int length,
+                                        VP8LBitEntropy* const bit_entropy,
+                                        VP8LStreaks* const stats) {
   int i = 1;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
@@ -521,439 +488,29 @@ void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
   for (i = 1; i < length; ++i) {
     const uint32_t xy = X[i] + Y[i];
     if (xy != xy_prev) {
-      VP8LGetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy,
-                                    stats);
+      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
     }
   }
-  VP8LGetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
 
   bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
 }
 
-static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
-  ++histo_argb[0][argb >> 24];
-  ++histo_argb[1][(argb >> 16) & 0xff];
-  ++histo_argb[2][(argb >> 8) & 0xff];
-  ++histo_argb[3][argb & 0xff];
-}
-
 //------------------------------------------------------------------------------
 
-static WEBP_INLINE uint32_t Predict(VP8LPredictorFunc pred_func,
-                                    int x, int y,
-                                    const uint32_t* current_row,
-                                    const uint32_t* upper_row) {
-  if (y == 0) {
-    return (x == 0) ? ARGB_BLACK : current_row[x - 1];  // Left.
-  } else if (x == 0) {
-    return upper_row[x];  // Top.
-  } else {
-    return pred_func(current_row[x - 1], upper_row + x);
-  }
-}
-
-static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
-  const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
-  const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
-  const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
-  const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
-  return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
-}
-
-static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
-                              uint32_t left, uint32_t right) {
-  const int diff_up = MaxDiffBetweenPixels(current, up);
-  const int diff_down = MaxDiffBetweenPixels(current, down);
-  const int diff_left = MaxDiffBetweenPixels(current, left);
-  const int diff_right = MaxDiffBetweenPixels(current, right);
-  return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
-}
-
-static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
-  const uint32_t green = (argb >> 8) & 0xff;
-  uint32_t red_blue = argb & 0x00ff00ffu;
-  red_blue += (green << 16) | green;
-  red_blue &= 0x00ff00ffu;
-  return (argb & 0xff00ff00u) | red_blue;
-}
-
-static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
-                           uint8_t* const max_diffs, int used_subtract_green) {
-  uint32_t current, up, down, left, right;
-  int x;
-  if (width <= 2) return;
-  current = argb[0];
-  right = argb[1];
-  if (used_subtract_green) {
-    current = AddGreenToBlueAndRed(current);
-    right = AddGreenToBlueAndRed(right);
-  }
-  // max_diffs[0] and max_diffs[width - 1] are never used.
-  for (x = 1; x < width - 1; ++x) {
-    up = argb[-stride + x];
-    down = argb[stride + x];
-    left = current;
-    current = right;
-    right = argb[x + 1];
-    if (used_subtract_green) {
-      up = AddGreenToBlueAndRed(up);
-      down = AddGreenToBlueAndRed(down);
-      right = AddGreenToBlueAndRed(right);
-    }
-    max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
-  }
-}
-
-// Quantize the difference between the actual component value and its prediction
-// to a multiple of quantization, working modulo 256, taking care not to cross
-// a boundary (inclusive upper limit).
-static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
-                                     uint8_t boundary, int quantization) {
-  const int residual = (value - predict) & 0xff;
-  const int boundary_residual = (boundary - predict) & 0xff;
-  const int lower = residual & ~(quantization - 1);
-  const int upper = lower + quantization;
-  // Resolve ties towards a value closer to the prediction (i.e. towards lower
-  // if value comes after prediction and towards upper otherwise).
-  const int bias = ((boundary - value) & 0xff) < boundary_residual;
-  if (residual - lower < upper - residual + bias) {
-    // lower is closer to residual than upper.
-    if (residual > boundary_residual && lower <= boundary_residual) {
-      // Halve quantization step to avoid crossing boundary. This midpoint is
-      // on the same side of boundary as residual because midpoint >= residual
-      // (since lower is closer than upper) and residual is above the boundary.
-      return lower + (quantization >> 1);
-    }
-    return lower;
-  } else {
-    // upper is closer to residual than lower.
-    if (residual <= boundary_residual && upper > boundary_residual) {
-      // Halve quantization step to avoid crossing boundary. This midpoint is
-      // on the same side of boundary as residual because midpoint <= residual
-      // (since upper is closer than lower) and residual is below the boundary.
-      return lower + (quantization >> 1);
-    }
-    return upper & 0xff;
-  }
-}
-
-// Quantize every component of the difference between the actual pixel value and
-// its prediction to a multiple of a quantization (a power of 2, not larger than
-// max_quantization which is a power of 2, smaller than max_diff). Take care if
-// value and predict have undergone subtract green, which means that red and
-// blue are represented as offsets from green.
-static uint32_t NearLossless(uint32_t value, uint32_t predict,
-                             int max_quantization, int max_diff,
-                             int used_subtract_green) {
-  int quantization;
-  uint8_t new_green = 0;
-  uint8_t green_diff = 0;
-  uint8_t a, r, g, b;
-  if (max_diff <= 2) {
-    return VP8LSubPixels(value, predict);
-  }
-  quantization = max_quantization;
-  while (quantization >= max_diff) {
-    quantization >>= 1;
-  }
-  if ((value >> 24) == 0 || (value >> 24) == 0xff) {
-    // Preserve transparency of fully transparent or fully opaque pixels.
-    a = ((value >> 24) - (predict >> 24)) & 0xff;
-  } else {
-    a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
-  }
-  g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
-                            quantization);
-  if (used_subtract_green) {
-    // The green offset will be added to red and blue components during decoding
-    // to obtain the actual red and blue values.
-    new_green = ((predict >> 8) + g) & 0xff;
-    // The amount by which green has been adjusted during quantization. It is
-    // subtracted from red and blue for compensation, to avoid accumulating two
-    // quantization errors in them.
-    green_diff = (new_green - (value >> 8)) & 0xff;
-  }
-  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
-                            (predict >> 16) & 0xff, 0xff - new_green,
-                            quantization);
-  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
-                            0xff - new_green, quantization);
-  return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
-}
-
-// Returns the difference between the pixel and its prediction. In case of a
-// lossy encoding, updates the source image to avoid propagating the deviation
-// further to pixels which depend on the current pixel for their predictions.
-static WEBP_INLINE uint32_t GetResidual(int width, int height,
-                                        uint32_t* const upper_row,
-                                        uint32_t* const current_row,
-                                        const uint8_t* const max_diffs,
-                                        int mode, VP8LPredictorFunc pred_func,
-                                        int x, int y, int max_quantization,
-                                        int exact, int used_subtract_green) {
-  const uint32_t predict = Predict(pred_func, x, y, current_row, upper_row);
-  uint32_t residual;
-  if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
-      x == 0 || x == width - 1) {
-    residual = VP8LSubPixels(current_row[x], predict);
-  } else {
-    residual = NearLossless(current_row[x], predict, max_quantization,
-                            max_diffs[x], used_subtract_green);
-    // Update the source image.
-    current_row[x] = VP8LAddPixels(predict, residual);
-    // x is never 0 here so we do not need to update upper_row like below.
-  }
-  if (!exact && (current_row[x] & kMaskAlpha) == 0) {
-    // If alpha is 0, cleanup RGB. We can choose the RGB values of the residual
-    // for best compression. The prediction of alpha itself can be non-zero and
-    // must be kept though. We choose RGB of the residual to be 0.
-    residual &= kMaskAlpha;
-    // Update the source image.
-    current_row[x] = predict & ~kMaskAlpha;
-    // The prediction for the rightmost pixel in a row uses the leftmost pixel
-    // in that row as its top-right context pixel. Hence if we change the
-    // leftmost pixel of current_row, the corresponding change must be applied
-    // to upper_row as well where top-right context is being read from.
-    if (x == 0 && y != 0) upper_row[width] = current_row[0];
-  }
-  return residual;
-}
-
-// Returns best predictor and updates the accumulated histogram.
-// If max_quantization > 1, assumes that near lossless processing will be
-// applied, quantizing residuals to multiples of quantization levels up to
-// max_quantization (the actual quantization level depends on smoothness near
-// the given pixel).
-static int GetBestPredictorForTile(int width, int height,
-                                   int tile_x, int tile_y, int bits,
-                                   int accumulated[4][256],
-                                   uint32_t* const argb_scratch,
-                                   const uint32_t* const argb,
-                                   int max_quantization,
-                                   int exact, int used_subtract_green) {
-  const int kNumPredModes = 14;
-  const int start_x = tile_x << bits;
-  const int start_y = tile_y << bits;
-  const int tile_size = 1 << bits;
-  const int max_y = GetMin(tile_size, height - start_y);
-  const int max_x = GetMin(tile_size, width - start_x);
-  // Whether there exist columns just outside the tile.
-  const int have_left = (start_x > 0);
-  const int have_right = (max_x < width - start_x);
-  // Position and size of the strip covering the tile and adjacent columns if
-  // they exist.
-  const int context_start_x = start_x - have_left;
-  const int context_width = max_x + have_left + have_right;
-  // The width of upper_row and current_row is one pixel larger than image width
-  // to allow the top right pixel to point to the leftmost pixel of the next row
-  // when at the right edge.
-  uint32_t* upper_row = argb_scratch;
-  uint32_t* current_row = upper_row + width + 1;
-  uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
-  float best_diff = MAX_DIFF_COST;
-  int best_mode = 0;
-  int mode;
-  int histo_stack_1[4][256];
-  int histo_stack_2[4][256];
-  // Need pointers to be able to swap arrays.
-  int (*histo_argb)[256] = histo_stack_1;
-  int (*best_histo)[256] = histo_stack_2;
-  int i, j;
-
-  for (mode = 0; mode < kNumPredModes; ++mode) {
-    const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
-    float cur_diff;
-    int relative_y;
-    memset(histo_argb, 0, sizeof(histo_stack_1));
-    if (start_y > 0) {
-      // Read the row above the tile which will become the first upper_row.
-      // Include a pixel to the left if it exists; include a pixel to the right
-      // in all cases (wrapping to the leftmost pixel of the next row if it does
-      // not exist).
-      memcpy(current_row + context_start_x,
-             argb + (start_y - 1) * width + context_start_x,
-             sizeof(*argb) * (max_x + have_left + 1));
-    }
-    for (relative_y = 0; relative_y < max_y; ++relative_y) {
-      const int y = start_y + relative_y;
-      int relative_x;
-      uint32_t* tmp = upper_row;
-      upper_row = current_row;
-      current_row = tmp;
-      // Read current_row. Include a pixel to the left if it exists; include a
-      // pixel to the right in all cases except at the bottom right corner of
-      // the image (wrapping to the leftmost pixel of the next row if it does
-      // not exist in the current row).
-      memcpy(current_row + context_start_x,
-             argb + y * width + context_start_x,
-             sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
-      if (max_quantization > 1 && y >= 1 && y + 1 < height) {
-        MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
-                       max_diffs + context_start_x, used_subtract_green);
-      }
-
-      for (relative_x = 0; relative_x < max_x; ++relative_x) {
-        const int x = start_x + relative_x;
-        UpdateHisto(histo_argb,
-                    GetResidual(width, height, upper_row, current_row,
-                                max_diffs, mode, pred_func, x, y,
-                                max_quantization, exact, used_subtract_green));
-      }
-    }
-    cur_diff = PredictionCostSpatialHistogram(
-        (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
-    if (cur_diff < best_diff) {
-      int (*tmp)[256] = histo_argb;
-      histo_argb = best_histo;
-      best_histo = tmp;
-      best_diff = cur_diff;
-      best_mode = mode;
-    }
-  }
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 256; j++) {
-      accumulated[i][j] += best_histo[i][j];
-    }
-  }
-
-  return best_mode;
-}
-
-// Converts pixels of the image to residuals with respect to predictions.
-// If max_quantization > 1, applies near lossless processing, quantizing
-// residuals to multiples of quantization levels up to max_quantization
-// (the actual quantization level depends on smoothness near the given pixel).
-static void CopyImageWithPrediction(int width, int height,
-                                    int bits, uint32_t* const modes,
-                                    uint32_t* const argb_scratch,
-                                    uint32_t* const argb,
-                                    int low_effort, int max_quantization,
-                                    int exact, int used_subtract_green) {
-  const int tiles_per_row = VP8LSubSampleSize(width, bits);
-  const int mask = (1 << bits) - 1;
-  // The width of upper_row and current_row is one pixel larger than image width
-  // to allow the top right pixel to point to the leftmost pixel of the next row
-  // when at the right edge.
-  uint32_t* upper_row = argb_scratch;
-  uint32_t* current_row = upper_row + width + 1;
-  uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
-  uint8_t* lower_max_diffs = current_max_diffs + width;
-  int y;
-  int mode = 0;
-  VP8LPredictorFunc pred_func = NULL;
-
-  for (y = 0; y < height; ++y) {
-    int x;
-    uint32_t* const tmp32 = upper_row;
-    upper_row = current_row;
-    current_row = tmp32;
-    memcpy(current_row, argb + y * width,
-           sizeof(*argb) * (width + (y + 1 < height)));
-
-    if (low_effort) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t predict = Predict(VP8LPredictors[kPredLowEffort], x, y,
-                                         current_row, upper_row);
-        argb[y * width + x] = VP8LSubPixels(current_row[x], predict);
-      }
-    } else {
-      if (max_quantization > 1) {
-        // Compute max_diffs for the lower row now, because that needs the
-        // contents of argb for the current row, which we will overwrite with
-        // residuals before proceeding with the next row.
-        uint8_t* const tmp8 = current_max_diffs;
-        current_max_diffs = lower_max_diffs;
-        lower_max_diffs = tmp8;
-        if (y + 2 < height) {
-          MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
-                         used_subtract_green);
-        }
-      }
-      for (x = 0; x < width; ++x) {
-        if ((x & mask) == 0) {
-          mode = (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
-          pred_func = VP8LPredictors[mode];
-        }
-        argb[y * width + x] = GetResidual(
-            width, height, upper_row, current_row, current_max_diffs, mode,
-            pred_func, x, y, max_quantization, exact, used_subtract_green);
-      }
-    }
-  }
-}
-
-// Finds the best predictor for each tile, and converts the image to residuals
-// with respect to predictions. If near_lossless_quality < 100, applies
-// near lossless processing, shaving off more bits of residuals for lower
-// qualities.
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
-                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image, int near_lossless_quality,
-                       int exact, int used_subtract_green) {
-  const int tiles_per_row = VP8LSubSampleSize(width, bits);
-  const int tiles_per_col = VP8LSubSampleSize(height, bits);
-  int tile_y;
-  int histo[4][256];
-  const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
-  if (low_effort) {
-    int i;
-    for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
-      image[i] = ARGB_BLACK | (kPredLowEffort << 8);
-    }
-  } else {
-    memset(histo, 0, sizeof(histo));
-    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
-      int tile_x;
-      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
-            bits, histo, argb_scratch, argb, max_quantization, exact,
-            used_subtract_green);
-        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
-      }
-    }
-  }
-
-  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
-                          low_effort, max_quantization, exact,
-                          used_subtract_green);
-}
-
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint32_t argb = argb_data[i];
-    const uint32_t green = (argb >> 8) & 0xff;
+    const int argb = argb_data[i];
+    const int green = (argb >> 8) & 0xff;
     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
-    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
-    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+    const uint32_t new_b = (((argb >>  0) & 0xff) - green) & 0xff;
+    argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
   }
 }
 
-static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
-  m->green_to_red_ = 0;
-  m->green_to_blue_ = 0;
-  m->red_to_blue_ = 0;
-}
-
-static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
-                                                int8_t color) {
-  return (uint32_t)((int)(color_pred) * color) >> 5;
-}
-
-static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
-                                               VP8LMultipliers* const m) {
-  m->green_to_red_  = (color_code >>  0) & 0xff;
-  m->green_to_blue_ = (color_code >>  8) & 0xff;
-  m->red_to_blue_   = (color_code >> 16) & 0xff;
-}
-
-static WEBP_INLINE uint32_t MultipliersToColorCode(
-    const VP8LMultipliers* const m) {
-  return 0xff000000u |
-         ((uint32_t)(m->red_to_blue_) << 16) |
-         ((uint32_t)(m->green_to_blue_) << 8) |
-         m->green_to_red_;
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
+  return ((int)color_pred * color) >> 5;
 }
 
 void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
@@ -963,8 +520,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
     const uint32_t argb = data[i];
     const uint32_t green = argb >> 8;
     const uint32_t red = argb >> 16;
-    uint32_t new_red = red;
-    uint32_t new_blue = argb;
+    int new_red = red;
+    int new_blue = argb;
     new_red -= ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@@ -977,7 +534,7 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
 static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
                                              uint32_t argb) {
   const uint32_t green = argb >> 8;
-  uint32_t new_red = argb >> 16;
+  int new_red = argb >> 16;
   new_red -= ColorTransformDelta(green_to_red, green);
   return (new_red & 0xff);
 }
@@ -993,15 +550,6 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   return (new_blue & 0xff);
 }
 
-static float PredictionCostCrossColor(const int accumulated[256],
-                                      const int counts[256]) {
-  // Favor low entropy, locally and globally.
-  // Favor small absolute values for PredictionCostSpatial
-  static const double kExpValue = 2.4;
-  return VP8LCombinedShannonEntropy(counts, accumulated) +
-         PredictionCostSpatial(counts, 3, kExpValue);
-}
-
 void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
                                      int tile_width, int tile_height,
                                      int green_to_red, int histo[]) {
@@ -1014,59 +562,6 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
   }
 }
 
-static float GetPredictionCostCrossColorRed(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
-    const int accumulated_red_histo[256]) {
-  int histo[256] = { 0 };
-  float cur_diff;
-
-  VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
-                                green_to_red, histo);
-
-  cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
-  if ((uint8_t)green_to_red == prev_x.green_to_red_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
-  }
-  if ((uint8_t)green_to_red == prev_y.green_to_red_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
-  }
-  if (green_to_red == 0) {
-    cur_diff -= 3;
-  }
-  return cur_diff;
-}
-
-static void GetBestGreenToRed(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
-    const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
-  const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
-  int green_to_red_best = 0;
-  int iter, offset;
-  float best_diff = GetPredictionCostCrossColorRed(
-      argb, stride, tile_width, tile_height, prev_x, prev_y,
-      green_to_red_best, accumulated_red_histo);
-  for (iter = 0; iter < kMaxIters; ++iter) {
-    // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
-    // one in color computation. Having initial delta here as 1 is sufficient
-    // to explore the range of (-2, 2).
-    const int delta = 32 >> iter;
-    // Try a negative and a positive delta from the best known value.
-    for (offset = -delta; offset <= delta; offset += 2 * delta) {
-      const int green_to_red_cur = offset + green_to_red_best;
-      const float cur_diff = GetPredictionCostCrossColorRed(
-          argb, stride, tile_width, tile_height, prev_x, prev_y,
-          green_to_red_cur, accumulated_red_histo);
-      if (cur_diff < best_diff) {
-        best_diff = cur_diff;
-        green_to_red_best = green_to_red_cur;
-      }
-    }
-  }
-  best_tx->green_to_red_ = green_to_red_best;
-}
-
 void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
@@ -1080,187 +575,6 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
   }
 }
 
-static float GetPredictionCostCrossColorBlue(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y,
-    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
-  int histo[256] = { 0 };
-  float cur_diff;
-
-  VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
-                                 green_to_blue, red_to_blue, histo);
-
-  cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
-  if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
-  }
-  if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
-  }
-  if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
-  }
-  if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
-  }
-  if (green_to_blue == 0) {
-    cur_diff -= 3;
-  }
-  if (red_to_blue == 0) {
-    cur_diff -= 3;
-  }
-  return cur_diff;
-}
-
-#define kGreenRedToBlueNumAxis 8
-#define kGreenRedToBlueMaxIters 7
-static void GetBestGreenRedToBlue(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
-    const int accumulated_blue_histo[256],
-    VP8LMultipliers* const best_tx) {
-  const int8_t offset[kGreenRedToBlueNumAxis][2] =
-      {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
-  const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
-  const int iters =
-      (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
-  int green_to_blue_best = 0;
-  int red_to_blue_best = 0;
-  int iter;
-  // Initial value at origin:
-  float best_diff = GetPredictionCostCrossColorBlue(
-      argb, stride, tile_width, tile_height, prev_x, prev_y,
-      green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
-  for (iter = 0; iter < iters; ++iter) {
-    const int delta = delta_lut[iter];
-    int axis;
-    for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
-      const int green_to_blue_cur =
-          offset[axis][0] * delta + green_to_blue_best;
-      const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
-      const float cur_diff = GetPredictionCostCrossColorBlue(
-          argb, stride, tile_width, tile_height, prev_x, prev_y,
-          green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
-      if (cur_diff < best_diff) {
-        best_diff = cur_diff;
-        green_to_blue_best = green_to_blue_cur;
-        red_to_blue_best = red_to_blue_cur;
-      }
-      if (quality < 25 && iter == 4) {
-        // Only axis aligned diffs for lower quality.
-        break;  // next iter.
-      }
-    }
-    if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
-      // Further iterations would not help.
-      break;  // out of iter-loop.
-    }
-  }
-  best_tx->green_to_blue_ = green_to_blue_best;
-  best_tx->red_to_blue_ = red_to_blue_best;
-}
-#undef kGreenRedToBlueMaxIters
-#undef kGreenRedToBlueNumAxis
-
-static VP8LMultipliers GetBestColorTransformForTile(
-    int tile_x, int tile_y, int bits,
-    VP8LMultipliers prev_x,
-    VP8LMultipliers prev_y,
-    int quality, int xsize, int ysize,
-    const int accumulated_red_histo[256],
-    const int accumulated_blue_histo[256],
-    const uint32_t* const argb) {
-  const int max_tile_size = 1 << bits;
-  const int tile_y_offset = tile_y * max_tile_size;
-  const int tile_x_offset = tile_x * max_tile_size;
-  const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
-  const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
-  const int tile_width = all_x_max - tile_x_offset;
-  const int tile_height = all_y_max - tile_y_offset;
-  const uint32_t* const tile_argb = argb + tile_y_offset * xsize
-                                  + tile_x_offset;
-  VP8LMultipliers best_tx;
-  MultipliersClear(&best_tx);
-
-  GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
-                    prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
-  GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
-                        prev_x, prev_y, quality, accumulated_blue_histo,
-                        &best_tx);
-  return best_tx;
-}
-
-static void CopyTileWithColorTransform(int xsize, int ysize,
-                                       int tile_x, int tile_y,
-                                       int max_tile_size,
-                                       VP8LMultipliers color_transform,
-                                       uint32_t* argb) {
-  const int xscan = GetMin(max_tile_size, xsize - tile_x);
-  int yscan = GetMin(max_tile_size, ysize - tile_y);
-  argb += tile_y * xsize + tile_x;
-  while (yscan-- > 0) {
-    VP8LTransformColor(&color_transform, argb, xscan);
-    argb += xsize;
-  }
-}
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image) {
-  const int max_tile_size = 1 << bits;
-  const int tile_xsize = VP8LSubSampleSize(width, bits);
-  const int tile_ysize = VP8LSubSampleSize(height, bits);
-  int accumulated_red_histo[256] = { 0 };
-  int accumulated_blue_histo[256] = { 0 };
-  int tile_x, tile_y;
-  VP8LMultipliers prev_x, prev_y;
-  MultipliersClear(&prev_y);
-  MultipliersClear(&prev_x);
-  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
-    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
-      int y;
-      const int tile_x_offset = tile_x * max_tile_size;
-      const int tile_y_offset = tile_y * max_tile_size;
-      const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
-      const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
-      const int offset = tile_y * tile_xsize + tile_x;
-      if (tile_y != 0) {
-        ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
-      }
-      prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
-                                            prev_x, prev_y,
-                                            quality, width, height,
-                                            accumulated_red_histo,
-                                            accumulated_blue_histo,
-                                            argb);
-      image[offset] = MultipliersToColorCode(&prev_x);
-      CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
-                                 max_tile_size, prev_x, argb);
-
-      // Gather accumulated histogram data.
-      for (y = tile_y_offset; y < all_y_max; ++y) {
-        int ix = y * width + tile_x_offset;
-        const int ix_end = ix + all_x_max - tile_x_offset;
-        for (; ix < ix_end; ++ix) {
-          const uint32_t pix = argb[ix];
-          if (ix >= 2 &&
-              pix == argb[ix - 2] &&
-              pix == argb[ix - 1]) {
-            continue;  // repeated pixels are handled by backward references
-          }
-          if (ix >= width + 2 &&
-              argb[ix - 2] == argb[ix - width - 2] &&
-              argb[ix - 1] == argb[ix - width - 1] &&
-              pix == argb[ix - width]) {
-            continue;  // repeated pixels are handled by backward references
-          }
-          ++accumulated_red_histo[(pix >> 16) & 0xff];
-          ++accumulated_blue_histo[(pix >> 0) & 0xff];
-        }
-      }
-    }
-  }
-}
-
 //------------------------------------------------------------------------------
 
 static int VectorMismatch(const uint32_t* const array1,
@@ -1274,8 +588,8 @@ static int VectorMismatch(const uint32_t* const array1,
 }
 
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-void VP8LBundleColorMap(const uint8_t* const row, int width,
-                        int xbits, uint32_t* const dst) {
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+                          uint32_t* dst) {
   int x;
   if (xbits > 0) {
     const int bit_depth = 1 << (3 - xbits);
@@ -1350,8 +664,172 @@ static void HistogramAdd(const VP8LHistogram* const a,
 }
 
 //------------------------------------------------------------------------------
+// Image transforms.
 
-VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {  // per-byte floor((a0 + a1) / 2) on four packed 8-bit lanes
+  return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);  // the 0xfe mask stops the shift from leaking a bit into the byte below
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+  return Average2(Average2(a0, a2), a1);  // a0 and a2 are averaged first; that result is then averaged with a1
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+                                     uint32_t a2, uint32_t a3) {
+  return Average2(Average2(a0, a1), Average2(a2, a3));  // average of the two pairwise averages (each step rounds down)
+}
+
+static WEBP_INLINE uint32_t Clip255(uint32_t a) {  // clamps a small-magnitude value to [0, 255]
+  if (a < 256) {
+    return a;
+  }
+  // Out of range: a negative int argument has wrapped and has its top byte
+  // set, so ~a >> 24 is 0; a value in [256, 2^24) has top byte 0, giving 255.
+  return ~a >> 24;
+}
+
+static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
+  return Clip255(a + b - c);  // a negative sum is converted to uint32_t on the way in; Clip255 maps it to 0
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {  // per byte: clip(c0 + c1 - c2)
+  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);  // bits 24..31
+  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
+                                         (c1 >> 16) & 0xff,
+                                         (c2 >> 16) & 0xff);  // bits 16..23
+  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
+                                         (c1 >> 8) & 0xff,
+                                         (c2 >> 8) & 0xff);  // bits 8..15
+  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);  // bits 0..7
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;  // repack the four clamped bytes
+}
+
+static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
+  return Clip255(a + (a - b) / 2);  // (a - b) / 2 is C integer division, so it truncates toward zero
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {  // per byte: clip(ave + (ave - c2) / 2)
+  const uint32_t ave = Average2(c0, c1);  // per-byte floor average of c0 and c1
+  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
+  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
+  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
+  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;  // repack the four clamped bytes
+}
+
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined, so inlining is disabled there (the check also matches LOCAL_GCC_VERSION 0x408).
+#if defined(__arm__) && \
+    (LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408)
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
+  const int pb = b - c;
+  const int pa = a - c;
+  return abs(pb) - abs(pa);  // |b - c| - |a - c|
+}
+
+#undef LOCAL_INLINE
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+  const int pa_minus_pb =  // sum over the four bytes of |b - c| - |a - c|
+      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
+      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
+      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
+      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
+  return (pa_minus_pb <= 0) ? a : b;  // a when sum|b - c| <= sum|a - c| (ties pick a), otherwise b
+}
+
+//------------------------------------------------------------------------------
+// Predictors
+
+static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+  (void)left;  // parameter unused by this predictor
+  return top[0];  // prediction is top[0]
+}
+static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+  (void)left;  // parameter unused by this predictor
+  return top[1];  // reads one element past top[0], so top[1] must be valid
+}
+static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+  (void)left;  // parameter unused by this predictor
+  return top[-1];  // reads one element before top, so top must not be the start of its buffer
+}
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average3(left, top[0], top[1]);  // Average2(Average2(left, top[1]), top[0])
+  return pred;
+}
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[-1]);  // per-byte average of left and top[-1]
+  return pred;
+}
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[0]);  // per-byte average of left and top[0]
+  return pred;
+}
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[-1], top[0]);  // per-byte average of top[-1] and top[0]
+  (void)left;  // parameter unused by this predictor
+  return pred;
+}
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[0], top[1]);  // per-byte average of top[0] and top[1]
+  (void)left;  // parameter unused by this predictor
+  return pred;
+}
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);  // avg(avg(left, top[-1]), avg(top[0], top[1]))
+  return pred;
+}
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], left, top[-1]);  // picks top[0] or left by comparing their byte-wise distances to top[-1]
+  return pred;
+}
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);  // per byte: clip(left + top[0] - top[-1])
+  return pred;
+}
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);  // per byte: clip(avg + (avg - top[-1]) / 2), avg = average(left, top[0])
+  return pred;
+}
+
+//------------------------------------------------------------------------------
+
+static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {  // out[i] = VP8LSubPixels(in[i], ARGB_BLACK) for each pixel
+  int i;
+  for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);  // VP8LSubPixels is defined outside this chunk
+  (void)upper;  // the row above is not needed for a constant prediction
+}
+
+static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {  // out[i] = VP8LSubPixels(in[i], in[i - 1]) for each pixel
+  int i;
+  for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);  // reads in[-1] when i == 0, so in must not be the buffer start
+  (void)upper;  // the row above is not used when predicting from the previous pixel
+}
+
+GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C)  // macro is defined outside this chunk; presumably emits PredictorSubN_C built on PredictorN
+GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C)
+GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C)
+GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C)
+GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C)
+GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C)
+GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C)
+GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C)
+GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C)
+GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C)
+GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C)
+GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C)  // these names are the ones stored into VP8LPredictorsSub[] / VP8LPredictorsSub_C[]
+
+//------------------------------------------------------------------------------
+
+VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 
 VP8LTransformColorFunc VP8LTransformColor;
 
@@ -1365,17 +843,23 @@ VP8LCostFunc VP8LExtraCost;
 VP8LCostCombinedFunc VP8LExtraCostCombined;
 VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
 
-GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
+VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
+VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
 
 VP8LHistogramAddFunc VP8LHistogramAdd;
 
 VP8LVectorMismatchFunc VP8LVectorMismatch;
+VP8LBundleColorMapFunc VP8LBundleColorMap;
+
+VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 
 extern void VP8LEncDspInitSSE2(void);
 extern void VP8LEncDspInitSSE41(void);
 extern void VP8LEncDspInitNEON(void);
 extern void VP8LEncDspInitMIPS32(void);
 extern void VP8LEncDspInitMIPSdspR2(void);
+extern void VP8LEncDspInitMSA(void);
 
 static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used =
     (VP8CPUInfo)&lossless_enc_last_cpuinfo_used;
@@ -1399,11 +883,47 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
   VP8LExtraCostCombined = ExtraCostCombined;
   VP8LCombinedShannonEntropy = CombinedShannonEntropy;
 
-  VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
 
   VP8LHistogramAdd = HistogramAdd;
 
   VP8LVectorMismatch = VectorMismatch;
+  VP8LBundleColorMap = VP8LBundleColorMap_C;
+
+  VP8LPredictorsSub[0] = PredictorSub0_C;
+  VP8LPredictorsSub[1] = PredictorSub1_C;
+  VP8LPredictorsSub[2] = PredictorSub2_C;
+  VP8LPredictorsSub[3] = PredictorSub3_C;
+  VP8LPredictorsSub[4] = PredictorSub4_C;
+  VP8LPredictorsSub[5] = PredictorSub5_C;
+  VP8LPredictorsSub[6] = PredictorSub6_C;
+  VP8LPredictorsSub[7] = PredictorSub7_C;
+  VP8LPredictorsSub[8] = PredictorSub8_C;
+  VP8LPredictorsSub[9] = PredictorSub9_C;
+  VP8LPredictorsSub[10] = PredictorSub10_C;
+  VP8LPredictorsSub[11] = PredictorSub11_C;
+  VP8LPredictorsSub[12] = PredictorSub12_C;
+  VP8LPredictorsSub[13] = PredictorSub13_C;
+  VP8LPredictorsSub[14] = PredictorSub0_C;  // <- padding security sentinels
+  VP8LPredictorsSub[15] = PredictorSub0_C;
+
+  VP8LPredictorsSub_C[0] = PredictorSub0_C;
+  VP8LPredictorsSub_C[1] = PredictorSub1_C;
+  VP8LPredictorsSub_C[2] = PredictorSub2_C;
+  VP8LPredictorsSub_C[3] = PredictorSub3_C;
+  VP8LPredictorsSub_C[4] = PredictorSub4_C;
+  VP8LPredictorsSub_C[5] = PredictorSub5_C;
+  VP8LPredictorsSub_C[6] = PredictorSub6_C;
+  VP8LPredictorsSub_C[7] = PredictorSub7_C;
+  VP8LPredictorsSub_C[8] = PredictorSub8_C;
+  VP8LPredictorsSub_C[9] = PredictorSub9_C;
+  VP8LPredictorsSub_C[10] = PredictorSub10_C;
+  VP8LPredictorsSub_C[11] = PredictorSub11_C;
+  VP8LPredictorsSub_C[12] = PredictorSub12_C;
+  VP8LPredictorsSub_C[13] = PredictorSub13_C;
+  VP8LPredictorsSub_C[14] = PredictorSub0_C;  // <- padding security sentinels
+  VP8LPredictorsSub_C[15] = PredictorSub0_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -1432,6 +952,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
       VP8LEncDspInitMIPSdspR2();
     }
 #endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8LEncDspInitMSA();
+    }
+#endif
   }
   lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 49c666d..4186b9f 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -14,6 +14,7 @@
 
 #include "./dsp.h"
 #include "./lossless.h"
+#include "./lossless_common.h"
 
 #if defined(WEBP_USE_MIPS32)
 
@@ -240,6 +241,49 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
 }
 
+static void GetEntropyUnrefined(const uint32_t X[], int length,
+                                VP8LBitEntropy* const bit_entropy,
+                                VP8LStreaks* const stats) {  // resets *bit_entropy and *stats, then fills them from X[0..length)
+  int i;
+  int i_prev = 0;
+  uint32_t x_prev = X[0];  // read unconditionally, so length must be at least 1
+
+  memset(stats, 0, sizeof(*stats));
+  VP8LBitEntropyInit(bit_entropy);
+
+  for (i = 1; i < length; ++i) {
+    const uint32_t x = X[i];
+    if (x != x_prev) {  // the helper is only invoked where the value changes, i.e. at the end of a run
+      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+    }
+  }
+  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);  // final call accounts for the last run (i == length here when length >= 1)
+
+  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
+                                        int length,
+                                        VP8LBitEntropy* const bit_entropy,
+                                        VP8LStreaks* const stats) {  // same as GetEntropyUnrefined, applied to the element-wise sum X[i] + Y[i]
+  int i = 1;  // redundant: the for loop below assigns i = 1 again
+  int i_prev = 0;
+  uint32_t xy_prev = X[0] + Y[0];  // read unconditionally, so length must be at least 1
+
+  memset(stats, 0, sizeof(*stats));
+  VP8LBitEntropyInit(bit_entropy);
+
+  for (i = 1; i < length; ++i) {
+    const uint32_t xy = X[i] + Y[i];
+    if (xy != xy_prev) {  // the helper is only invoked where the summed value changes
+      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+    }
+  }
+  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);  // final call accounts for the last run
+
+  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
 #define ASM_START                                       \
   __asm__ volatile(                                     \
     ".set   push                            \n\t"       \
@@ -375,7 +419,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
   VP8LFastLog2Slow = FastLog2Slow;
   VP8LExtraCost = ExtraCost;
   VP8LExtraCostCombined = ExtraCostCombined;
-  VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
   VP8LHistogramAdd = HistogramAdd;
 }
 
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
new file mode 100644
index 0000000..2f69ba3
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
@@ -0,0 +1,147 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of Image transform methods for lossless encoder.
+//
+// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./lossless.h"
+#include "./msa_macro.h"
+
+#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do {  \
+  v8i16 g0, g1, t0, t1, t2, t3;                                               \
+  v4i32 t4, t5;                                                               \
+  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1);                   \
+  DOTP_SB2_SH(g0, g1, c0, c0, t0, t1);                                        \
+  SRAI_H2_SH(t0, t1, 5);                                                      \
+  t0 = __msa_subv_h((v8i16)src0, t0);                                         \
+  t1 = __msa_subv_h((v8i16)src1, t1);                                         \
+  t4 = __msa_srli_w((v4i32)src0, 16);                                         \
+  t5 = __msa_srli_w((v4i32)src1, 16);                                         \
+  DOTP_SB2_SH(t4, t5, c1, c1, t2, t3);                                        \
+  SRAI_H2_SH(t2, t3, 5);                                                      \
+  SUB2(t0, t2, t1, t3, t0, t1);                                               \
+  VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1);                   \
+} while (0)
+
+#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do {  \
+  const v16i8 g0 = VSHF_SB(src, src, mask0);                    \
+  v8i16 t0 = __msa_dotp_s_h(c0, g0);                            \
+  v8i16 t1;                                                     \
+  v4i32 t2;                                                     \
+  t0 = SRAI_H(t0, 5);                                           \
+  t0 = __msa_subv_h((v8i16)src, t0);                            \
+  t2 = __msa_srli_w((v4i32)src, 16);                            \
+  t1 = __msa_dotp_s_h(c1, (v16i8)t2);                           \
+  t1 = SRAI_H(t1, 5);                                           \
+  t0 = t0 - t1;                                                 \
+  dst = VSHF_UB(src, t0, mask1);                                \
+} while (0)
+
+static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
+                           int num_pixels) {  // transforms data[0..num_pixels) in place using the multipliers in *m
+  v16u8 src0, dst0;
+  const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+                                         (m->green_to_red_ << 16));  // both green multipliers packed into every 32-bit lane
+  const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);  // red_to_blue_ replicated into every 32-bit lane
+  const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                        13, 255, 13, 255 };  // shuffle control: byte 1 of each pixel, twice per pixel
+  const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+                        28, 13, 30, 15 };  // shuffle control used for the final merge into dst
+
+  while (num_pixels >= 8) {  // main loop: 8 pixels (two vectors) per iteration
+    v16u8 src1, dst1;
+    LD_UB2(data, 4, src0, src1);
+    TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+    ST_UB2(dst0, dst1, data, 4);
+    data += 8;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {  // one more whole vector of 4 pixels
+      src0 = LD_UB(data);
+      TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+      ST_UB(dst0, data);
+      data += 4;
+      num_pixels -= 4;
+    }
+    if (num_pixels > 0) {  // 1 to 3 pixels left
+      src0 = LD_UB(data);  // NOTE(review): presumably a full 16-byte load with < 4 pixels left - confirm it cannot read past the buffer
+      TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+      if (num_pixels == 3) {  // only the remaining pixels are written back
+        const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+        const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+        SD(pix_d, data + 0);
+        SW(pix_w, data + 2);
+      } else if (num_pixels == 2) {
+        const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+        SD(pix_d, data);
+      } else {
+        const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+        SW(pix_w, data);
+      }
+    }
+  }
+}
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {  // in place: bytes 0 and 2 of each pixel get byte 1 subtracted
+  int i;
+  uint8_t* ptemp_data = (uint8_t*)argb_data;  // pixels are addressed as raw bytes from here on
+  v16u8 src0, dst0, tmp0;
+  const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                       13, 255, 13, 255 };  // shuffle control: byte 1 of each pixel placed at byte positions 0 and 2
+
+  while (num_pixels >= 8) {  // main loop: 8 pixels (two vectors, 16 bytes apart) per iteration
+    v16u8 src1, dst1, tmp1;
+    LD_UB2(ptemp_data, 16, src0, src1);
+    VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+    SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
+    ST_UB2(dst0, dst1, ptemp_data, 16);
+    ptemp_data += 8 * 4;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {  // one more whole vector of 4 pixels
+      src0 = LD_UB(ptemp_data);
+      tmp0 = VSHF_UB(src0, src0, mask);
+      dst0 = src0 - tmp0;
+      ST_UB(dst0, ptemp_data);
+      ptemp_data += 4 * 4;
+      num_pixels -= 4;
+    }
+    for (i = 0; i < num_pixels; i++) {  // scalar tail for the last 0 to 3 pixels
+      const uint8_t b = ptemp_data[0];
+      const uint8_t g = ptemp_data[1];
+      const uint8_t r = ptemp_data[2];
+      ptemp_data[0] = (b - g) & 0xff;  // subtraction wraps modulo 256
+      ptemp_data[2] = (r - g) & 0xff;  // bytes 1 and 3 are left unchanged
+      ptemp_data += 4;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {  // points two encoder dispatch pointers at this file's MSA routines
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LTransformColor = TransformColor;  // no other function pointers are changed here
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 7c894e7..8ad85d9 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -17,6 +17,8 @@
 #include <assert.h>
 #include <emmintrin.h>
 #include "./lossless.h"
+#include "./common_sse2.h"
+#include "./lossless_common.h"
 
 // For sign-extended multiplying constants, pre-shifted by 5:
 #define CST_5b(X)  (((int16_t)((uint16_t)X << 8)) >> 5)
@@ -35,7 +37,9 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
     _mm_storeu_si128((__m128i*)&argb_data[i], out);
   }
   // fallthrough and finish off with plain-C
-  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+  if (i != num_pixels) {
+    VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -69,7 +73,9 @@ static void TransformColor(const VP8LMultipliers* const m,
     _mm_storeu_si128((__m128i*)&argb_data[i], out);
   }
   // fallthrough and finish off with plain-C
-  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+  if (i != num_pixels) {
+    VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -364,8 +370,9 @@ static int VectorMismatch(const uint32_t* const array1,
       if (length >= 8 &&
           _mm_movemask_epi8(_mm_cmpeq_epi32(
               _mm_loadu_si128((const __m128i*)&array1[4]),
-              _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff)
+              _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
         match_len = 8;
+      }
     }
   }
 
@@ -375,6 +382,295 @@ static int VectorMismatch(const uint32_t* const array1,
   return match_len;
 }
 
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. SSE2 path: 16 bytes of row per iteration, leftover goes to the C version.
+static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
+                                uint32_t* dst) {
+  int x;
+  assert(xbits >= 0);
+  assert(xbits <= 3);
+  switch (xbits) {  // 1 << xbits input bytes are packed into each output pixel
+    case 0: {
+      const __m128i ff = _mm_set1_epi16(0xff00);
+      const __m128i zero = _mm_setzero_si128();
+      // Store 0xff000000 | (row[x] << 8).
+      for (x = 0; x + 16 <= width; x += 16, dst += 16) {  // 16 bytes in, 16 pixels out
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
+        const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
+        const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
+        const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
+        const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
+        const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
+        _mm_storeu_si128((__m128i*)&dst[0], dst0);
+        _mm_storeu_si128((__m128i*)&dst[4], dst1);
+        _mm_storeu_si128((__m128i*)&dst[8], dst2);
+        _mm_storeu_si128((__m128i*)&dst[12], dst3);
+      }
+      break;
+    }
+    case 1: {
+      const __m128i ff = _mm_set1_epi16(0xff00);
+      const __m128i mul = _mm_set1_epi16(0x110);
+      for (x = 0; x + 16 <= width; x += 16, dst += 8) {  // 16 bytes in, 8 pixels out
+        // 0a0b | (where a/b are 4 bits).
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i tmp = _mm_mullo_epi16(in, mul);  // aba0
+        const __m128i pack = _mm_and_si128(tmp, ff);   // ab00
+        const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
+        const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
+        _mm_storeu_si128((__m128i*)&dst[0], dst0);
+        _mm_storeu_si128((__m128i*)&dst[4], dst1);
+      }
+      break;
+    }
+    case 2: {
+      const __m128i mask_or = _mm_set1_epi32(0xff000000);
+      const __m128i mul_cst = _mm_set1_epi16(0x0104);
+      const __m128i mask_mul = _mm_set1_epi16(0x0f00);
+      for (x = 0; x + 16 <= width; x += 16, dst += 4) {  // 16 bytes in, 4 pixels out
+        // 000a000b000c000d | (where a/b/c/d are 2 bits).
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i mul = _mm_mullo_epi16(in, mul_cst);  // 00ab00b000cd00d0
+        const __m128i tmp = _mm_and_si128(mul, mask_mul);  // 00ab000000cd0000
+        const __m128i shift = _mm_srli_epi32(tmp, 12);     // 00000000ab000000
+        const __m128i pack = _mm_or_si128(shift, tmp);     // 00000000abcd0000
+        // Convert to 0xff00**00.
+        const __m128i res = _mm_or_si128(pack, mask_or);
+        _mm_storeu_si128((__m128i*)dst, res);
+      }
+      break;
+    }
+    default: {
+      assert(xbits == 3);
+      for (x = 0; x + 16 <= width; x += 16, dst += 2) {  // 16 bytes in, 2 pixels out
+        // 0000000a00000000b... | (where a/b are 1 bit).
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i shift = _mm_slli_epi64(in, 7);  // moves bit 0 of every byte up to bit 7
+        const uint32_t move = _mm_movemask_epi8(shift);  // gathers those 16 top bits into one 16-bit mask
+        dst[0] = 0xff000000 | ((move & 0xff) << 8);
+        dst[1] = 0xff000000 | (move & 0xff00);
+      }
+      break;
+    }
+  }
+  if (x != width) {  // fewer than 16 bytes left (or width < 16): dst already points past the pixels written above
+    VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Batch version of Predictor Transform subtraction
+
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+                                       const __m128i* const a1,
+                                       __m128i* const avg) {  // *avg = per-byte floor((a0 + a1) / 2)
+  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+  const __m128i ones = _mm_set1_epi8(1);
+  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);  // rounds up: (a + b + 1) >> 1
+  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);  // 1 in each byte where a + b is odd
+  *avg = _mm_sub_epi8(avg1, one);  // undo the round-up for odd sums
+}
+
+// Predictor0: ARGB_BLACK. out[i] = in[i] - ARGB_BLACK, per byte.
+static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // 4 pixels per iteration
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i res = _mm_sub_epi8(src, black);  // byte-wise subtraction, wraps modulo 256
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {  // up to 3 leftover pixels go through the C table entry
+    VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+#define GENERATE_PREDICTOR_1(X, IN)                                           \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+                                   int num_pixels, uint32_t* out) {           \
+  int i;                                                                      \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
+    const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));              \
+    const __m128i res = _mm_sub_epi8(src, pred);                              \
+    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
+  }                                                                           \
+  if (i != num_pixels) {                                                      \
+    VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
+  }                                                                           \
+}
+
+GENERATE_PREDICTOR_1(1, in[i - 1])       // Predictor1: L (the load starts at in[-1] when i == 0)
+GENERATE_PREDICTOR_1(2, upper[i])        // Predictor2: T
+GENERATE_PREDICTOR_1(3, upper[i + 1])    // Predictor3: TR (the 4-pixel load reaches upper[i + 4])
+GENERATE_PREDICTOR_1(4, upper[i - 1])    // Predictor4: TL (the load starts at upper[-1] when i == 0)
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: avg2(avg2(L, TR), T); out[i] = in[i] - prediction, per byte.
+static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // 4 pixels per iteration
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);  // starts at in[-1] when i == 0
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);  // reaches upper[i + 4]
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    __m128i avg, pred, res;
+    Average2_m128i(&L, &TR, &avg);
+    Average2_m128i(&avg, &T, &pred);
+    res = _mm_sub_epi8(src, pred);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {  // up to 3 leftover pixels go through the C table entry
+    VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+#define GENERATE_PREDICTOR_2(X, A, B)                                         \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+                                   int num_pixels, uint32_t* out) {           \
+  int i;                                                                      \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
+    const __m128i tA = _mm_loadu_si128((const __m128i*)&(A));                 \
+    const __m128i tB = _mm_loadu_si128((const __m128i*)&(B));                 \
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
+    __m128i pred, res;                                                        \
+    Average2_m128i(&tA, &tB, &pred);                                          \
+    res = _mm_sub_epi8(src, pred);                                            \
+    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
+  }                                                                           \
+  if (i != num_pixels) {                                                      \
+    VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
+  }                                                                           \
+}
+
+GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1])   // Predictor6: avg(L, TL)
+GENERATE_PREDICTOR_2(7, in[i - 1], upper[i])       // Predictor7: avg(L, T)
+GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i])    // Predictor8: avg(TL, T)
+GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])    // Predictor9: avg(T, TR)
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: avg(avg(L,TL), avg(T, TR)); out[i] = in[i] - prediction, per byte.
+static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // 4 pixels per iteration
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);  // starts at in[-1] when i == 0
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);  // starts at upper[-1] when i == 0
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);  // reaches upper[i + 4]
+    __m128i avgTTR, avgLTL, avg, res;
+    Average2_m128i(&T, &TR, &avgTTR);
+    Average2_m128i(&L, &TL, &avgLTL);
+    Average2_m128i(&avgTTR, &avgLTL, &avg);
+    res = _mm_sub_epi8(src, avg);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {  // up to 3 leftover pixels go through the C table entry
+    VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor11: select. GetSumAbsDiff32() computes, per pixel, the sum of absolute byte differences of *A and *B.
+static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
+                            __m128i* const out) {
+  // We can unpack with any value on the upper 32 bits, provided it's the same
+  // on both operands (so that their sum of abs diff is zero). Here we use *A.
+  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);  // one sum per 64-bit half: pixels 0 and 1
+  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);  // pixels 2 and 3
+  *out = _mm_packs_epi32(s_lo, s_hi);  // the four sums (at most 4 * 255 each) end up one per 32-bit lane
+}
+
+static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {  // out[i] = in[i] - (L or T, whichever Select picks), per byte
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // 4 pixels per iteration
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    __m128i pa, pb;
+    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
+    GetSumAbsDiff32(&L, &TL, &pb);   // pb = sum |L-TL|
+    {
+      const __m128i mask = _mm_cmpgt_epi32(pb, pa);  // all ones in the lanes where pb > pa
+      const __m128i A = _mm_and_si128(mask, L);
+      const __m128i B = _mm_andnot_si128(mask, T);
+      const __m128i pred = _mm_or_si128(A, B);    // pred = (pb > pa) ? L : T
+      const __m128i res = _mm_sub_epi8(src, pred);
+      _mm_storeu_si128((__m128i*)&out[i], res);
+    }
+  }
+  if (i != num_pixels) {  // up to 3 leftover pixels go through the C table entry
+    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor12: ClampedAddSubtractFull, i.e. clip(L + T - TL) per byte.
+static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 4 <= num_pixels; i += 4) {  // 4 pixels per iteration
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);  // bytes widened to 16 bits so the sum cannot wrap
+    const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+    const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);  // T - TL
+    const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
+    const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);  // L + T - TL
+    const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
+    const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);  // unsigned saturation performs the clamp to [0, 255]
+    const __m128i res = _mm_sub_epi8(src, pred);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {  // up to 3 leftover pixels go through the C table entry
+    VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor13: ClampedAddSubtractHalf, i.e. clip(avg + (avg - TL) / 2) per byte with avg = (L + T) >> 1.
+static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 2 <= num_pixels; i += 2) {
+    // we can only process two pixels at a time
+    const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);  // 64-bit load = 2 pixels
+    const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
+    const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
+    const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
+    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);  // bytes widened to 16 bits
+    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+    const __m128i sum = _mm_add_epi16(T_lo, L_lo);
+    const __m128i avg = _mm_srli_epi16(sum, 1);  // floor((L + T) / 2)
+    const __m128i A1 = _mm_sub_epi16(avg, TL_lo);  // avg - TL, may be negative
+    const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);  // -1 where avg - TL is negative
+    const __m128i A2 = _mm_sub_epi16(A1, bit_fix);  // adds 1 to negative values so the shift below truncates toward zero
+    const __m128i A3 = _mm_srai_epi16(A2, 1);  // equals (avg - TL) / 2 with C rounding
+    const __m128i A4 = _mm_add_epi16(avg, A3);
+    const __m128i pred = _mm_packus_epi16(A4, A4);  // unsigned saturation performs the clamp to [0, 255]
+    const __m128i res = _mm_sub_epi8(src, pred);
+    _mm_storel_epi64((__m128i*)&out[i], res);  // only the 2 processed pixels are written
+  }
+  if (i != num_pixels) {  // at most 1 leftover pixel goes through the C table entry
+    VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
 //------------------------------------------------------------------------------
 // Entry point
 
@@ -388,6 +684,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
   VP8LHistogramAdd = HistogramAdd;
   VP8LCombinedShannonEntropy = CombinedShannonEntropy;
   VP8LVectorMismatch = VectorMismatch;
+  VP8LBundleColorMap = BundleColorMap_SSE2;
+
+  VP8LPredictorsSub[0] = PredictorSub0_SSE2;
+  VP8LPredictorsSub[1] = PredictorSub1_SSE2;
+  VP8LPredictorsSub[2] = PredictorSub2_SSE2;
+  VP8LPredictorsSub[3] = PredictorSub3_SSE2;
+  VP8LPredictorsSub[4] = PredictorSub4_SSE2;
+  VP8LPredictorsSub[5] = PredictorSub5_SSE2;
+  VP8LPredictorsSub[6] = PredictorSub6_SSE2;
+  VP8LPredictorsSub[7] = PredictorSub7_SSE2;
+  VP8LPredictorsSub[8] = PredictorSub8_SSE2;
+  VP8LPredictorsSub[9] = PredictorSub9_SSE2;
+  VP8LPredictorsSub[10] = PredictorSub10_SSE2;
+  VP8LPredictorsSub[11] = PredictorSub11_SSE2;
+  VP8LPredictorsSub[12] = PredictorSub12_SSE2;
+  VP8LPredictorsSub[13] = PredictorSub13_SSE2;
+  VP8LPredictorsSub[14] = PredictorSub0_SSE2;  // <- padding security sentinels
+  VP8LPredictorsSub[15] = PredictorSub0_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 3e49319..821057c 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -32,7 +32,9 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
     _mm_storeu_si128((__m128i*)&argb_data[i], out);
   }
   // fallthrough and finish off with plain-C
-  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+  if (i != num_pixels) {
+    VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+  }
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 90aed7f..2984ce8 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -17,6 +17,7 @@
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
 #include "./lossless.h"
+#include "./lossless_common.h"
 
 #define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)                 \
 static void FUNC_NAME(const TYPE* src,                                         \
@@ -227,25 +228,27 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
+static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
+                                 uint32_t* dst) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  uint32_t* const p_loop1_end = data + (num_pixels & ~3);
-  uint32_t* const p_loop2_end = data + num_pixels;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
   __asm__ volatile (
     ".set       push                                          \n\t"
     ".set       noreorder                                     \n\t"
-    "beq        %[data],         %[p_loop1_end],     3f       \n\t"
+    "beq        %[src],          %[p_loop1_end],     3f       \n\t"
     " nop                                                     \n\t"
   "0:                                                         \n\t"
-    "lw         %[temp0],        0(%[data])                   \n\t"
-    "lw         %[temp1],        4(%[data])                   \n\t"
-    "lw         %[temp2],        8(%[data])                   \n\t"
-    "lw         %[temp3],        12(%[data])                  \n\t"
+    "lw         %[temp0],        0(%[src])                    \n\t"
+    "lw         %[temp1],        4(%[src])                    \n\t"
+    "lw         %[temp2],        8(%[src])                    \n\t"
+    "lw         %[temp3],        12(%[src])                   \n\t"
     "ext        %[temp4],        %[temp0],           8,    8  \n\t"
     "ext        %[temp5],        %[temp1],           8,    8  \n\t"
     "ext        %[temp6],        %[temp2],           8,    8  \n\t"
     "ext        %[temp7],        %[temp3],           8,    8  \n\t"
-    "addiu      %[data],         %[data],            16       \n\t"
+    "addiu      %[src],          %[src],             16       \n\t"
+    "addiu      %[dst],          %[dst],             16       \n\t"
     "replv.ph   %[temp4],        %[temp4]                     \n\t"
     "replv.ph   %[temp5],        %[temp5]                     \n\t"
     "replv.ph   %[temp6],        %[temp6]                     \n\t"
@@ -254,44 +257,47 @@ static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
     "addu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
     "addu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
     "addu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
-    "sw         %[temp0],        -16(%[data])                 \n\t"
-    "sw         %[temp1],        -12(%[data])                 \n\t"
-    "sw         %[temp2],        -8(%[data])                  \n\t"
-    "bne        %[data],         %[p_loop1_end],     0b       \n\t"
-    " sw        %[temp3],        -4(%[data])                  \n\t"
+    "sw         %[temp0],        -16(%[dst])                  \n\t"
+    "sw         %[temp1],        -12(%[dst])                  \n\t"
+    "sw         %[temp2],        -8(%[dst])                   \n\t"
+    "bne        %[src],          %[p_loop1_end],     0b       \n\t"
+    " sw        %[temp3],        -4(%[dst])                   \n\t"
   "3:                                                         \n\t"
-    "beq        %[data],         %[p_loop2_end],     2f       \n\t"
+    "beq        %[src],          %[p_loop2_end],     2f       \n\t"
     " nop                                                     \n\t"
   "1:                                                         \n\t"
-    "lw         %[temp0],        0(%[data])                   \n\t"
-    "addiu      %[data],         %[data],            4        \n\t"
+    "lw         %[temp0],        0(%[src])                    \n\t"
+    "addiu      %[src],          %[src],             4        \n\t"
+    "addiu      %[dst],          %[dst],             4        \n\t"
     "ext        %[temp4],        %[temp0],           8,    8  \n\t"
     "replv.ph   %[temp4],        %[temp4]                     \n\t"
     "addu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
-    "bne        %[data],         %[p_loop2_end],     1b       \n\t"
-    " sw        %[temp0],        -4(%[data])                  \n\t"
+    "bne        %[src],          %[p_loop2_end],     1b       \n\t"
+    " sw        %[temp0],        -4(%[dst])                   \n\t"
   "2:                                                         \n\t"
     ".set       pop                                           \n\t"
-    : [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
-      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
-      [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+    : [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7)
     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
     : "memory"
   );
 }
 
 static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  uint32_t* data, int num_pixels) {
+                                  const uint32_t* src, int num_pixels,
+                                  uint32_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   uint32_t argb, argb1, new_red;
   const uint32_t G_to_R = m->green_to_red_;
   const uint32_t G_to_B = m->green_to_blue_;
   const uint32_t R_to_B = m->red_to_blue_;
-  uint32_t* const p_loop_end = data + (num_pixels & ~1);
+  const uint32_t* const p_loop_end = src + (num_pixels & ~1);
   __asm__ volatile (
     ".set            push                                    \n\t"
     ".set            noreorder                               \n\t"
-    "beq             %[data],      %[p_loop_end],  1f        \n\t"
+    "beq             %[src],       %[p_loop_end],  1f        \n\t"
     " nop                                                    \n\t"
     "replv.ph        %[temp0],     %[G_to_R]                 \n\t"
     "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
@@ -303,9 +309,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
     "shra.ph         %[temp1],     %[temp1],       8         \n\t"
     "shra.ph         %[temp2],     %[temp2],       8         \n\t"
   "0:                                                        \n\t"
-    "lw              %[argb],      0(%[data])                \n\t"
-    "lw              %[argb1],     4(%[data])                \n\t"
-    "addiu           %[data],      %[data],        8         \n\t"
+    "lw              %[argb],      0(%[src])                 \n\t"
+    "lw              %[argb1],     4(%[src])                 \n\t"
+    "sw              %[argb],      0(%[dst])                 \n\t"
+    "sw              %[argb1],     4(%[dst])                 \n\t"
+    "addiu           %[src],       %[src],         8         \n\t"
+    "addiu           %[dst],       %[dst],         8         \n\t"
     "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
     "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
     "shll.ph         %[temp3],     %[temp3],       8         \n\t"
@@ -322,29 +331,29 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
     "shll.ph         %[temp4],     %[temp5],       8         \n\t"
     "shra.ph         %[temp4],     %[temp4],       8         \n\t"
     "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"
-    "sb              %[temp5],     -2(%[data])               \n\t"
+    "sb              %[temp5],     -2(%[dst])                \n\t"
     "sra             %[temp5],     %[temp5],       16        \n\t"
     "shra.ph         %[temp4],     %[temp4],       5         \n\t"
     "addu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
     "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
-    "sb              %[temp5],     -6(%[data])               \n\t"
-    "sb              %[temp3],     -4(%[data])               \n\t"
+    "sb              %[temp5],     -6(%[dst])                \n\t"
+    "sb              %[temp3],     -4(%[dst])                \n\t"
     "sra             %[temp3],     %[temp3],       16        \n\t"
-    "bne             %[data],      %[p_loop_end],  0b        \n\t"
-    " sb             %[temp3],     -8(%[data])               \n\t"
+    "bne             %[src],       %[p_loop_end],  0b        \n\t"
+    " sb             %[temp3],     -8(%[dst])                \n\t"
   "1:                                                        \n\t"
     ".set            pop                                     \n\t"
     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
       [new_red]"=&r"(new_red), [argb]"=&r"(argb),
-      [argb1]"=&r"(argb1), [data]"+&r"(data)
+      [argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
     : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
       [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
     : "memory", "hi", "lo"
   );
 
   // Fall-back to C-version for left-overs.
-  if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1);
+  if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
 }
 
 static void ConvertBGRAToRGB(const uint32_t* src,
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_msa.c
new file mode 100644
index 0000000..f6dd564
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_msa.c
@@ -0,0 +1,355 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of methods for lossless decoder
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./lossless.h"
+#include "./msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do {  /* 16 pixels per use: callers step psrc by 64 bytes, pdst by 48 */ \
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;          \
+  LD_UB4(psrc, 16, src0, src1, src2, src3);                /* four input vectors, stride argument 16 */ \
+  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  /* presumably dst0 = shuffle(src0, src1, m0), dst1 = shuffle(src1, src2, m1) */ \
+  dst2 = VSHF_UB(src2, src3, m2);                          \
+  ST_UB2(dst0, dst1, pdst, 16);                            /* first two output vectors */ \
+  ST_UB(dst2, pdst + 32);                                  /* third output vector at byte offset 32 */ \
+} while (0)
+
+#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do {  /* 12 pixels per use: callers step psrc by 48 bytes, pdst by 36 */ \
+  uint32_t pix_w;                                          \
+  v16u8 src0, src1, src2, dst0, dst1, dst2;                \
+  LD_UB3(psrc, 16, src0, src1, src2);                      /* three input vectors, stride argument 16 */ \
+  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
+  dst2 = VSHF_UB(src2, src2, m2);                          /* no fourth vector here: src2 is passed twice */ \
+  ST_UB2(dst0, dst1, pdst, 16);                            \
+  pix_w = __msa_copy_s_w((v4i32)dst2, 0);                  /* only 32-bit element 0 of dst2 is kept */ \
+  SW(pix_w, pdst + 32);                                    /* presumably a 32-bit store; SW is defined outside this file */ \
+} while (0)
+
+#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {  /* 8 pixels per use: callers step psrc by 32 bytes, pdst by 24 */ \
+  uint64_t pix_d;                                          \
+  v16u8 src0, src1, dst0, dst1;                            \
+  LD_UB2(psrc, 16, src0, src1);                            /* only two vectors are loaded for 8 pixels */ \
+  VSHF_B2_UB(src0, src1, src1, src1, m0, m1, dst0, dst1);  /* src1 fills the last slot: never pass an unloaded vector */ \
+  ST_UB(dst0, pdst);                                       \
+  pix_d = __msa_copy_s_d((v2i64)dst1, 0);                  /* only 64-bit element 0 of dst1 is stored */ \
+  SD(pix_d, pdst + 16);                                    \
+} while (0)
+
+#define CONVERT4_BGRA_XXX(psrc, pdst, m) do {  /* 4 pixels per use: callers step psrc by 16 bytes, pdst by 12 */ \
+  const v16u8 src0 = LD_UB(psrc);                   \
+  const v16u8 dst0 = VSHF_UB(src0, src0, m);        /* single input vector, passed as both shuffle sources */ \
+  uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);  /* 64-bit element 0 of the shuffled vector */ \
+  uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);  /* 32-bit element 2 of the shuffled vector */ \
+  SD(pix_d, pdst + 0);                              /* 8 bytes at offset 0 ... */ \
+  SW(pix_w, pdst + 8);                              /* ... then 4 bytes at offset 8 */ \
+} while (0)
+
+#define CONVERT1_BGRA_BGR(psrc, pdst) do {  /* one pixel: pdst[0..2] = psrc[0..2], same order; psrc[3] is not read */ \
+  const int32_t b = (psrc)[0];              \
+  const int32_t g = (psrc)[1];              \
+  const int32_t r = (psrc)[2];              \
+  (pdst)[0] = b;                            \
+  (pdst)[1] = g;                            \
+  (pdst)[2] = r;                            \
+} while (0)
+
+#define CONVERT1_BGRA_RGB(psrc, pdst) do {  /* one pixel: pdst[0..2] = psrc[2], psrc[1], psrc[0]; psrc[3] is not read */ \
+  const int32_t b = (psrc)[0];              \
+  const int32_t g = (psrc)[1];              \
+  const int32_t r = (psrc)[2];              \
+  (pdst)[0] = r;                            /* byte order reversed on output */ \
+  (pdst)[1] = g;                            \
+  (pdst)[2] = b;                            \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1,     /* two vectors in (src0, src1), two out (dst0, dst1) */ \
+                                  c0, c1, mask0, mask1) do {  /* c0, c1: multiplier vectors; mask0, mask1: shuffle tables */ \
+  v8i16 g0, g1, t0, t1, t2, t3;                               \
+  v4i32 t4, t5;                                               \
+  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1);   /* g0, g1: bytes of src0, src1 selected by mask0 */ \
+  DOTP_SB2_SH(g0, g1, c0, c0, t0, t1);                        /* first products, against c0 */ \
+  SRAI_H2_SH(t0, t1, 5);                                      /* presumably a per-halfword arithmetic >> 5 (defined in msa_macro.h) */ \
+  t0 = __msa_addv_h(t0, (v8i16)src0);                         /* add the scaled products to the source halfwords */ \
+  t1 = __msa_addv_h(t1, (v8i16)src1);                         \
+  t4 = __msa_srli_w((v4i32)t0, 16);                           /* upper half of every 32-bit lane moved down */ \
+  t5 = __msa_srli_w((v4i32)t1, 16);                           \
+  DOTP_SB2_SH(t4, t5, c1, c1, t2, t3);                        /* second products, against c1 */ \
+  SRAI_H2_SH(t2, t3, 5);                                      \
+  ADD2(t0, t2, t1, t3, t0, t1);                               \
+  VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1);   /* outputs: bytes of src and t combined through mask1 */ \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do {  /* same steps as TRANSFORM_COLOR_INVERSE_8, on one vector */ \
+  const v16i8 g0 = VSHF_SB(src, src, mask0);                            /* bytes of src selected by mask0 */ \
+  v8i16 t0 = __msa_dotp_s_h(c0, g0);                                    /* first products, against c0 */ \
+  v8i16 t1;                                                             \
+  v4i32 t2;                                                             \
+  t0 = SRAI_H(t0, 5);                                                   \
+  t0 = __msa_addv_h(t0, (v8i16)src);                                    /* add the scaled products to the source halfwords */ \
+  t2 = __msa_srli_w((v4i32)t0, 16);                                     /* upper half of every 32-bit lane moved down */ \
+  t1 = __msa_dotp_s_h(c1, (v16i8)t2);                                   /* second products, against c1 */ \
+  t1 = SRAI_H(t1, 5);                                                   \
+  t0 = t0 + t1;                                                         \
+  dst = VSHF_UB(src, t0, mask1);                                        /* output: bytes of src and t0 combined through mask1 */ \
+} while (0)
+
+// Copies 'num_pixels' 4-byte pixels from 'src' to 'dst' while exchanging bytes
+// 0 and 2 of every pixel; bytes 1 and 3 keep their position.
+static void ConvertBGRAToRGBA(const uint32_t* src,
+                              int num_pixels, uint8_t* dst) {
+  const uint8_t* rd = (const uint8_t*)src;
+  uint8_t* wr = (uint8_t*)dst;
+  v16u8 px0, res0;
+  const v16u8 perm = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+
+  // Two vectors (8 pixels, 32 bytes) per iteration.
+  for (; num_pixels >= 8; num_pixels -= 8) {
+    v16u8 px1, res1;
+    LD_UB2(rd, 16, px0, px1);
+    VSHF_B2_UB(px0, px0, px1, px1, perm, perm, res0, res1);
+    ST_UB2(res0, res1, wr, 16);
+    rd += 32;
+    wr += 32;
+  }
+  if (num_pixels >= 4) {
+    px0 = LD_UB(rd);
+    res0 = VSHF_UB(px0, px0, perm);
+    ST_UB(res0, wr);
+    rd += 16;
+    wr += 16;
+    num_pixels -= 4;
+  }
+  // Byte-wise tail for the 0 to 3 pixels left; read the pixel before writing.
+  for (; num_pixels > 0; --num_pixels) {
+    const uint8_t c0 = rd[0];
+    const uint8_t c1 = rd[1];
+    const uint8_t c2 = rd[2];
+    const uint8_t c3 = rd[3];
+    wr[0] = c2;
+    wr[1] = c1;
+    wr[2] = c0;
+    wr[3] = c3;
+    rd += 4;
+    wr += 4;
+  }
+}
+
+// Packs 'num_pixels' 4-byte pixels from 'src' into 3-byte pixels at 'dst'.
+// The scalar tail keeps bytes 0, 1 and 2 of each pixel in order and drops
+// byte 3; the vector steps do the packing through the shuffle tables below.
+static void ConvertBGRAToBGR(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint8_t* rd = (const uint8_t*)src;
+  uint8_t* wr = (uint8_t*)dst;
+  // One byte-selection table per 16-byte output vector of a 16-pixel group.
+  const v16u8 shuf0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
+                        16, 17, 18, 20 };
+  const v16u8 shuf1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20,
+                        21, 22, 24, 25 };
+  const v16u8 shuf2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25,
+                        26, 28, 29, 30 };
+
+  // 16 pixels at a time: the source advances 64 bytes, the output 48.
+  for (; num_pixels >= 16; num_pixels -= 16) {
+    CONVERT16_BGRA_XXX(rd, wr, shuf0, shuf1, shuf2);
+    rd += 64;
+    wr += 48;
+  }
+  // Then at most one narrower vector step (12, 8 or 4 pixels).
+  if (num_pixels >= 12) {
+    CONVERT12_BGRA_XXX(rd, wr, shuf0, shuf1, shuf2);
+    rd += 48;
+    wr += 36;
+    num_pixels -= 12;
+  } else if (num_pixels >= 8) {
+    CONVERT8_BGRA_XXX(rd, wr, shuf0, shuf1);
+    rd += 32;
+    wr += 24;
+    num_pixels -= 8;
+  } else if (num_pixels >= 4) {
+    CONVERT4_BGRA_XXX(rd, wr, shuf0);
+    rd += 16;
+    wr += 12;
+    num_pixels -= 4;
+  }
+  // Scalar tail: fewer than 4 pixels remain here, converted one by one
+  // (4 source bytes in, 3 output bytes out).
+  for (; num_pixels > 0; --num_pixels) {
+    CONVERT1_BGRA_BGR(rd, wr);
+    rd += 4;
+    wr += 3;
+  }
+}
+
+// Packs 'num_pixels' 4-byte pixels from 'src' into 3-byte pixels at 'dst'.
+// The scalar tail emits bytes 2, 1 and 0 of each pixel (reversed order) and
+// drops byte 3; the vector steps do the same through the tables below.
+static void ConvertBGRAToRGB(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint8_t* rd = (const uint8_t*)src;
+  uint8_t* wr = (uint8_t*)dst;
+  // One byte-selection table per 16-byte output vector of a 16-pixel group.
+  const v16u8 shuf0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
+                        18, 17, 16, 22 };
+  const v16u8 shuf1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22,
+                        21, 20, 26, 25 };
+  const v16u8 shuf2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25,
+                        24, 30, 29, 28 };
+
+  // 16 pixels at a time: the source advances 64 bytes, the output 48.
+  for (; num_pixels >= 16; num_pixels -= 16) {
+    CONVERT16_BGRA_XXX(rd, wr, shuf0, shuf1, shuf2);
+    rd += 64;
+    wr += 48;
+  }
+  // Then at most one narrower vector step (12, 8 or 4 pixels).
+  if (num_pixels >= 12) {
+    CONVERT12_BGRA_XXX(rd, wr, shuf0, shuf1, shuf2);
+    rd += 48;
+    wr += 36;
+    num_pixels -= 12;
+  } else if (num_pixels >= 8) {
+    CONVERT8_BGRA_XXX(rd, wr, shuf0, shuf1);
+    rd += 32;
+    wr += 24;
+    num_pixels -= 8;
+  } else if (num_pixels >= 4) {
+    CONVERT4_BGRA_XXX(rd, wr, shuf0);
+    rd += 16;
+    wr += 12;
+    num_pixels -= 4;
+  }
+  // Scalar tail: fewer than 4 pixels remain here, converted one by one
+  // (4 source bytes in, 3 output bytes out).
+  for (; num_pixels > 0; --num_pixels) {
+    CONVERT1_BGRA_RGB(rd, wr);
+    rd += 4;
+    wr += 3;
+  }
+}
+
+// Adds each pixel's green byte (byte 1) to its bytes 0 and 2, modulo 256.
+static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
+                                 uint32_t* dst) {
+  const uint8_t* in = (const uint8_t*)src;
+  uint8_t* out = (uint8_t*)dst;
+  v16u8 src0, dst0, tmp0;
+  const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                       13, 255, 13, 255 };
+
+  while (num_pixels >= 8) {
+    v16u8 src1, dst1, tmp1;
+    LD_UB2(in, 16, src0, src1);
+    VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+    ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
+    ST_UB2(dst0, dst1, out, 16);
+    in += 32;
+    out += 32;
+    num_pixels -= 8;
+  }
+  if (num_pixels >= 4) {
+    src0 = LD_UB(in);
+    tmp0 = VSHF_UB(src0, src0, mask);
+    dst0 = src0 + tmp0;
+    ST_UB(dst0, out);
+    in += 16;
+    out += 16;
+    num_pixels -= 4;
+  }
+  // Scalar tail, 0..3 pixels: write all four bytes and advance both cursors.
+  for (; num_pixels > 0; --num_pixels) {
+    const uint8_t b = in[0];
+    const uint8_t g = in[1];
+    const uint8_t r = in[2];
+    out[0] = (b + g) & 0xff;
+    out[1] = g;
+    out[2] = (r + g) & 0xff;
+    out[3] = in[3];  // alpha sits at byte 3 of the pixel, not byte 4
+    in += 4;
+    out += 4;
+  }
+}
+
+// Inverse color transform of 'num_pixels' pixels from 'src' to 'dst' using 'm'.
+static void TransformColorInverse(const VP8LMultipliers* const m,
+                                  const uint32_t* src, int num_pixels,
+                                  uint32_t* dst) {
+  v16u8 in0, out0;
+  const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+                                         (m->green_to_red_ << 16));
+  const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+  const v16u8 shuf0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                        13, 255, 13, 255 };
+  const v16u8 shuf1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+                        28, 13, 30, 15 };
+
+  for (; num_pixels >= 8; num_pixels -= 8) {
+    v16u8 in1, out1;
+    LD_UB2(src, 4, in0, in1);
+    TRANSFORM_COLOR_INVERSE_8(in0, in1, out0, out1, g2br, r2b, shuf0, shuf1);
+    ST_UB2(out0, out1, dst, 4);
+    src += 8;
+    dst += 8;
+  }
+  if (num_pixels >= 4) {
+    in0 = LD_UB(src);
+    TRANSFORM_COLOR_INVERSE_4(in0, out0, g2br, r2b, shuf0, shuf1);
+    ST_UB(out0, dst);
+    src += 4;
+    dst += 4;
+    num_pixels -= 4;
+  }
+  if (num_pixels > 0) {
+    // 1..3 pixels left: transform a whole vector, store only what is needed.
+    // NOTE(review): this load spans 4 pixels; confirm 'src' has that slack.
+    in0 = LD_UB(src);
+    TRANSFORM_COLOR_INVERSE_4(in0, out0, g2br, r2b, shuf0, shuf1);
+    if (num_pixels == 3) {
+      const uint64_t pix_d = __msa_copy_s_d((v2i64)out0, 0);
+      const uint32_t pix_w = __msa_copy_s_w((v4i32)out0, 2);
+      SD(pix_d, dst + 0);
+      SW(pix_w, dst + 2);
+    } else if (num_pixels == 2) {
+      const uint64_t pix_d = __msa_copy_s_d((v2i64)out0, 0);
+      SD(pix_d, dst);
+    } else {
+      const uint32_t pix_w = __msa_copy_s_w((v4i32)out0, 0);
+      SW(pix_w, dst);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMSA(void);
+// Points the VP8L function pointers below at the MSA routines of this file.
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
+  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
index 6faccb8..1145d5f 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -139,6 +139,357 @@ static void ConvertBGRAToRGB(const uint32_t* src,
 
 #endif   // !WORK_AROUND_GCC
 
+
+//------------------------------------------------------------------------------
+// Predictor Transform
+// NEON helper macros; each expands to an expression (caller adds the ';').
+#define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN)))
+#define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
+#define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
+#define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
+#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0)
+#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0)
+#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)))
+#define ROTATE32_LEFT(L) vextq_u8((L), (L), 12)    // D|C|B|A -> C|B|A|D
+
+static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
+  // Broadcast each pixel into a 64-bit vector, then take the per-byte
+  // truncating average (vhadd_u8 computes (x + y) >> 1 in every lane).
+  return vhadd_u8(LOAD_U32_AS_U8(a0), LOAD_U32_AS_U8(a1));
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0,
+                                                        uint32_t c1,
+                                                        uint32_t c2) {
+  const uint8x8_t avg = Average2_u8_NEON(c0, c1);  // per-byte (c0 + c1) >> 1
+  // Subtract one from c2 in lanes where c2 > avg (cmp is 0xff == -1 there).
+  const uint8x8_t C2 = LOAD_U32_AS_U8(c2);
+  const uint8x8_t cmp = vcgt_u8(C2, avg);
+  const uint8x8_t C2_1 = vadd_u8(C2, cmp);
+  // Compute half of the difference between avg and the adjusted c2.
+  const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1));
+  // Widen avg to 16 bits, add the signed half-difference, saturate to 8 bits.
+  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg));
+  const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg));
+  const uint32_t output = GET_U8_AS_U32(res);  // read back 32-bit lane 0
+  return output;
+}
+
+static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) {
+  // Per-byte truncating average; result read back from 32-bit lane 0.
+  const uint32_t packed = GET_U8_AS_U32(Average2_u8_NEON(a0, a1));
+  return packed;
+}
+
+static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
+                                          uint32_t a2) {
+  // avg(avg(a0, a2), a1), each step being a per-byte truncating average.
+  const uint8x8_t outer = Average2_u8_NEON(a0, a2);
+  const uint32_t avg = GET_U8_AS_U32(vhadd_u8(outer, LOAD_U32_AS_U8(a1)));
+  return avg;
+}
+
+static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
+  return Average3_NEON(left, top[0], top[1]);  // avg(avg(L, TR), T)
+}
+static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
+  return Average2_NEON(left, top[-1]);  // avg(L, TL)
+}
+static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
+  return Average2_NEON(left, top[0]);  // avg(L, T)
+}
+static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
+  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);  // L, T, TL
+}
+
+// Batch versions of those functions.
+
+// Predictor0: adds the constant ARGB_BLACK to each pixel, four per iteration.
+static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  const uint8x16_t black = LOADQ_U32_AS_U8(ARGB_BLACK);
+  int x = 0;
+  for (; x + 4 <= num_pixels; x += 4) {
+    const uint8x16_t residual = LOADQ_U32P_AS_U8(in + x);
+    STOREQ_U8_AS_U32P(out + x, vaddq_u8(residual, black));
+  }
+  // Remaining pixels (fewer than four) are left to the C implementation.
+  VP8LPredictorsAdd_C[0](in + x, upper + x, num_pixels - x, out + x);
+}
+
+// Predictor1: left. Per byte, out[i] = in[i] + out[i - 1] (a prefix sum).
+static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  const uint8x16_t zero = LOADQ_U32_AS_U8(0);
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    // a | b | c | d
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    // 0 | a | b | c
+    const uint8x16_t shift0 = vextq_u8(zero, src, 12);
+    // a | a + b | b + c | c + d
+    const uint8x16_t sum0 = vaddq_u8(src, shift0);
+    // 0 | 0 | a | a + b
+    const uint8x16_t shift1 = vextq_u8(zero, sum0, 8);
+    // a | a + b | a + b + c | a + b + c + d
+    const uint8x16_t sum1 = vaddq_u8(sum0, shift1);
+    const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]);  // left pixel, splat
+    const uint8x16_t res = vaddq_u8(sum1, prev);
+    STOREQ_U8_AS_U32P(&out[i], res);
+  }
+  VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);  // tail
+}
+
+// Macro generating PredictorAdd<X>_NEON: adds the pixels at IN to 'in' using
+// mod 256 arithmetic per 8 bit channel, four pixels per iteration.
+#define GENERATE_PREDICTOR_1(X, IN)                                       \
+static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
+                                   const uint32_t* upper, int num_pixels, \
+                                   uint32_t* out) {                       \
+  int i;                                                                  \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
+    const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN));                     \
+    const uint8x16_t res = vaddq_u8(src, other);                          \
+    STOREQ_U8_AS_U32P(&out[i], res);                                      \
+  }                                                                       \
+  VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);   \
+}
+// Predictor2: Top (upper[i]).
+GENERATE_PREDICTOR_1(2, upper[i])
+// Predictor3: Top-right (upper[i + 1]).
+GENERATE_PREDICTOR_1(3, upper[i + 1])
+// Predictor4: Top-left (upper[i - 1]).
+GENERATE_PREDICTOR_1(4, upper[i - 1])
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: average(average(left, TR), T). DO_PRED5 emits pixel i + LANE.
+#define DO_PRED5(LANE) do {                                              \
+  const uint8x16_t avgLTR = vhaddq_u8(L, TR);                            \
+  const uint8x16_t avg = vhaddq_u8(avgLTR, T);                           \
+  const uint8x16_t res = vaddq_u8(avg, src);                             \
+  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));   \
+  L = ROTATE32_LEFT(res);                                                \
+} while (0)
+
+static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);  // left pixel, in every lane
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]);
+    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
+    DO_PRED5(0);  // each step stores one lane and rotates it into L
+    DO_PRED5(1);
+    DO_PRED5(2);
+    DO_PRED5(3);
+  }
+  VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i);  // tail
+}
+#undef DO_PRED5
+
+#define DO_PRED67(LANE) do {                                             \
+  const uint8x16_t avg = vhaddq_u8(L, top);                              \
+  const uint8x16_t res = vaddq_u8(avg, src);                             \
+  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));   \
+  L = ROTATE32_LEFT(res);                                                \
+} while (0)
+
+// Predictor6: average(left, TL). Pixels are produced one lane at a time.
+static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);  // left pixel, in every lane
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]);  // TL
+    DO_PRED67(0);
+    DO_PRED67(1);
+    DO_PRED67(2);
+    DO_PRED67(3);
+  }
+  VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i);  // tail
+}
+
+// Predictor7: average(left, T). Pixels are produced one lane at a time.
+static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);  // left pixel, in every lane
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]);  // T
+    DO_PRED67(0);
+    DO_PRED67(1);
+    DO_PRED67(2);
+    DO_PRED67(3);
+  }
+  VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i);  // tail
+}
+#undef DO_PRED67
+
+#define GENERATE_PREDICTOR_2(X, IN)                                       \
+static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
+                                   const uint32_t* upper, int num_pixels, \
+                                   uint32_t* out) {                       \
+  int i;                                                                  \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
+    const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN));                    \
+    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);                     \
+    const uint8x16_t avg = vhaddq_u8(T, Tother);                          \
+    const uint8x16_t res = vaddq_u8(avg, src);                            \
+    STOREQ_U8_AS_U32P(&out[i], res);                                      \
+  }                                                                       \
+  VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);   \
+}
+// Predictor8: per-byte average of T and TL (upper[i - 1]), added to 'in'.
+GENERATE_PREDICTOR_2(8, upper[i - 1])
+// Predictor9: per-byte average of T and TR (upper[i + 1]), added to 'in'.
+GENERATE_PREDICTOR_2(9, upper[i + 1])
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(LANE) do {                                             \
+  const uint8x16_t avgLTL = vhaddq_u8(L, TL);                            \
+  const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL);                      \
+  const uint8x16_t res = vaddq_u8(avg, src);                             \
+  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));   \
+  L = ROTATE32_LEFT(res);                                                \
+} while (0)
+
+static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);  // left pixel, in every lane
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
+    const uint8x16_t avgTTR = vhaddq_u8(T, TR);  // needs no left pixel
+    DO_PRED10(0);  // each step stores one lane and rotates it into L
+    DO_PRED10(1);
+    DO_PRED10(2);
+    DO_PRED10(3);
+  }
+  VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED10
+
+// Predictor11: select. Uses T when sum|L - TL| <= sum|T - TL|, else L.
+#define DO_PRED11(LANE) do {                                                   \
+  const uint8x16_t sumLin = vaddq_u8(L, src);  /* in + L */                    \
+  const uint8x16_t pLTL = vabdq_u8(L, TL);  /* |L - TL| */                     \
+  const uint16x8_t sum_LTL = vpaddlq_u8(pLTL);                                 \
+  const uint32x4_t pa = vpaddlq_u16(sum_LTL);                                  \
+  const uint32x4_t mask = vcleq_u32(pa, pb);                                   \
+  const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \
+  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE));         \
+  L = ROTATE32_LEFT(res);                                                      \
+} while (0)
+
+static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);  // left pixel, in every lane
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+    const uint8x16_t pTTL = vabdq_u8(T, TL);   // |T - TL|
+    const uint16x8_t sum_TTL = vpaddlq_u8(pTTL);
+    const uint32x4_t pb = vpaddlq_u16(sum_TTL);  // per-pixel sum of |T - TL|
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    const uint8x16_t sumTin = vaddq_u8(T, src);   // in + T
+    DO_PRED11(0);
+    DO_PRED11(1);
+    DO_PRED11(2);
+    DO_PRED11(3);
+  }
+  VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED11
+
+// Predictor12: ClampedAddSubtractFull, i.e. saturated L + (T - TL) per byte.
+#define DO_PRED12(DIFF, LANE) do {                                       \
+  const uint8x8_t pred =                                                 \
+      vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF)));          \
+  const uint8x8_t res =                                                  \
+      vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \
+  const uint16x8_t res16 = vmovl_u8(res);                                \
+  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
+  /* swap the 64-bit halves so this pixel is the next lane's left */     \
+  L = vextq_u16(res16, res16, 4);                                        \
+} while (0)
+
+static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));  // left, widened to u16
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    // load four pixels of source
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    // precompute the difference T - TL once for all, stored as s16
+    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+    const int16x8_t diff_lo =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL)));
+    const int16x8_t diff_hi =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL)));
+    // loop over the four reconstructed pixels (two per 64-bit half)
+    DO_PRED12(diff_lo, 0);
+    DO_PRED12(diff_lo, 1);
+    DO_PRED12(diff_hi, 2);
+    DO_PRED12(diff_hi, 3);
+  }
+  VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED12
+
+// Predictor13: ClampedAddSubtractHalf, with avg taken between L and T.
+#define DO_PRED13(LANE, LOW_OR_HI) do {                                        \
+  const uint8x16_t avg = vhaddq_u8(L, T);                                      \
+  const uint8x16_t cmp = vcgtq_u8(TL, avg);                                    \
+  const uint8x16_t TL_1 = vaddq_u8(TL, cmp);                                   \
+  /* Compute half of the difference between avg and TL'. */                    \
+  const int8x8_t diff_avg =                                                    \
+      vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1)));                     \
+  /* Compute the sum with avg and saturate. */                                 \
+  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg)));    \
+  const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg));             \
+  const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta);                        \
+  const uint8x16_t res2 = vcombine_u8(res, res);                               \
+  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1);       \
+  L = ROTATE32_LEFT(res2);                                                     \
+} while (0)
+
+static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);  // left pixel, in every lane
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+    DO_PRED13(0, vget_low_u8);   // pixels 0 and 1 use the low 64-bit half
+    DO_PRED13(1, vget_low_u8);
+    DO_PRED13(2, vget_high_u8);  // pixels 2 and 3 use the high 64-bit half
+    DO_PRED13(3, vget_high_u8);
+  }
+  VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED13
+
+#undef LOAD_U32_AS_U8
+#undef LOAD_U32P_AS_U8
+#undef LOADQ_U32_AS_U8
+#undef LOADQ_U32P_AS_U8
+#undef GET_U8_AS_U32
+#undef GETQ_U8_AS_U32
+#undef STOREQ_U8_AS_U32P
+#undef ROTATE32_LEFT
+
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
@@ -171,28 +522,30 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
 }
 #endif  // USE_VTBLQ
 
-static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
-  const uint32_t* const end = argb_data + (num_pixels & ~3);
+static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
+                                 uint32_t* dst) {
+  const uint32_t* const end = src + (num_pixels & ~3);
 #ifdef USE_VTBLQ
   const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
 #else
   const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
 #endif
-  for (; argb_data < end; argb_data += 4) {
-    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+  for (; src < end; src += 4, dst += 4) {
+    const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
     const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
-    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
+    vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
   }
   // fallthrough and finish off with plain-C
-  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
+  VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
 }
 
 //------------------------------------------------------------------------------
 // Color Transform
 
 static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  uint32_t* argb_data, int num_pixels) {
-  // sign-extended multiplying constants, pre-shifted by 6.
+                                  const uint32_t* const src, int num_pixels,
+                                  uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
   const int16_t rb[8] = {
     CST(green_to_blue_), CST(green_to_red_),
@@ -219,7 +572,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
   const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+    const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
     const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
     // 0 g 0 g
     const uint8x16_t greens = DoGreenShuffle(in, shuffle);
@@ -240,10 +593,10 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
     // 0  r'  0  b''
     const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
     const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
-    vst1q_u32(argb_data + i, out);
+    vst1q_u32(dst + i, out);
   }
   // Fall-back to C-version for left-overs.
-  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+  VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
 }
 
 #undef USE_VTBLQ
@@ -254,6 +607,26 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
 extern void VP8LDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
+  VP8LPredictors[5] = Predictor5_NEON;
+  VP8LPredictors[6] = Predictor6_NEON;
+  VP8LPredictors[7] = Predictor7_NEON;
+  VP8LPredictors[13] = Predictor13_NEON;
+
+  VP8LPredictorsAdd[0] = PredictorAdd0_NEON;
+  VP8LPredictorsAdd[1] = PredictorAdd1_NEON;
+  VP8LPredictorsAdd[2] = PredictorAdd2_NEON;
+  VP8LPredictorsAdd[3] = PredictorAdd3_NEON;
+  VP8LPredictorsAdd[4] = PredictorAdd4_NEON;
+  VP8LPredictorsAdd[5] = PredictorAdd5_NEON;
+  VP8LPredictorsAdd[6] = PredictorAdd6_NEON;
+  VP8LPredictorsAdd[7] = PredictorAdd7_NEON;
+  VP8LPredictorsAdd[8] = PredictorAdd8_NEON;
+  VP8LPredictorsAdd[9] = PredictorAdd9_NEON;
+  VP8LPredictorsAdd[10] = PredictorAdd10_NEON;
+  VP8LPredictorsAdd[11] = PredictorAdd11_NEON;
+  VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
+  VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
+
   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
   VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
   VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index 2d016c2..15aae93 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -14,9 +14,12 @@
 #include "./dsp.h"
 
 #if defined(WEBP_USE_SSE2)
+
+#include "./common_sse2.h"
+#include "./lossless.h"
+#include "./lossless_common.h"
 #include <assert.h>
 #include <emmintrin.h>
-#include "./lossless.h"
 
 //------------------------------------------------------------------------------
 // Predictor Transform
@@ -75,25 +78,44 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
   return (pa_minus_pb <= 0) ? a : b;
 }
 
-static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+                                       const __m128i* const a1,
+                                       __m128i* const avg) {
+  // Floor average per byte. _mm_avg_epu8 rounds up, hence the correction:
+  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+  const __m128i parity = _mm_and_si128(_mm_xor_si128(*a0, *a1),
+                                       _mm_set1_epi8(1));
+  *avg = _mm_sub_epi8(_mm_avg_epu8(*a0, *a1), parity);
+}
+
+static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
+                                        __m128i* const avg) {
+  // Floor average per byte of two pixels placed in the low 32 bits:
+  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+  const __m128i lhs = _mm_cvtsi32_si128(a0);
+  const __m128i rhs = _mm_cvtsi32_si128(a1);
+  const __m128i parity = _mm_and_si128(_mm_xor_si128(lhs, rhs),
+                                       _mm_set1_epi8(1));
+  *avg = _mm_sub_epi8(_mm_avg_epu8(lhs, rhs), parity);
+}
+
+static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
   const __m128i sum = _mm_add_epi16(A1, A0);
-  const __m128i avg = _mm_srli_epi16(sum, 1);
-  return avg;
+  return _mm_srli_epi16(sum, 1);
 }
 
 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
-  const __m128i avg = Average2_128i(a0, a1);
-  const __m128i A2 = _mm_packus_epi16(avg, avg);
-  const uint32_t output = _mm_cvtsi128_si32(A2);
-  return output;
+  __m128i output;
+  Average2_uint32(a0, a1, &output);
+  return _mm_cvtsi128_si32(output);
 }
 
 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i avg1 = Average2_128i(a0, a2);
+  const __m128i avg1 = Average2_uint32_16(a0, a2);
   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
   const __m128i sum = _mm_add_epi16(avg1, A1);
   const __m128i avg2 = _mm_srli_epi16(sum, 1);
@@ -104,8 +126,8 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
 
 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                      uint32_t a2, uint32_t a3) {
-  const __m128i avg1 = Average2_128i(a0, a1);
-  const __m128i avg2 = Average2_128i(a2, a3);
+  const __m128i avg1 = Average2_uint32_16(a0, a1);
+  const __m128i avg2 = Average2_uint32_16(a2, a3);
   const __m128i sum = _mm_add_epi16(avg2, avg1);
   const __m128i avg3 = _mm_srli_epi16(sum, 1);
   const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@@ -113,68 +135,289 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return output;
 }
 
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average3(left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(left, top[0]);
   return pred;
 }
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Select(top[0], left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
   return pred;
 }
 
+// Batch versions of those functions.
+
+// Predictor0: adds the constant ARGB_BLACK to each pixel, four per iteration.
+static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  int x = 0;
+  for (; x + 4 <= num_pixels; x += 4) {
+    const __m128i residual = _mm_loadu_si128((const __m128i*)(in + x));
+    _mm_storeu_si128((__m128i*)(out + x), _mm_add_epi8(residual, black));
+  }
+  // Leftover pixels (fewer than four) go through the C implementation;
+  // nothing is called when the vector loop consumed the whole row.
+  if (x == num_pixels) return;
+  VP8LPredictorsAdd_C[0](in + x, upper + x, num_pixels - x, out + x);
+}
+
+// Predictor1: left. Per byte, out[i] = in[i] + out[i - 1] (a prefix sum).
+static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  __m128i prev = _mm_set1_epi32(out[-1]);  // left pixel on all four lanes
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    // a | b | c | d
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    // 0 | a | b | c
+    const __m128i shift0 = _mm_slli_si128(src, 4);
+    // a | a + b | b + c | c + d
+    const __m128i sum0 = _mm_add_epi8(src, shift0);
+    // 0 | 0 | a | a + b
+    const __m128i shift1 = _mm_slli_si128(sum0, 8);
+    // a | a + b | a + b + c | a + b + c + d
+    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
+    const __m128i res = _mm_add_epi8(sum1, prev);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+    // replicate the last output pixel (lane 3) on the four lanes
+    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
+  }
+  if (i != num_pixels) {  // leftover pixels go through the C implementation
+    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Macro generating PredictorAdd<X>_SSE2: adds the pixels at IN to 'in' using
+// mod 256 arithmetic per 8 bit channel, four pixels per iteration.
+#define GENERATE_PREDICTOR_1(X, IN)                                           \
+static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+                                  int num_pixels, uint32_t* out) {            \
+  int i;                                                                      \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
+    const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));             \
+    const __m128i res = _mm_add_epi8(src, other);                             \
+    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
+  }                                                                           \
+  if (i != num_pixels) {                                                      \
+    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
+  }                                                                           \
+}
+
+// Predictor2: Top (upper[i]).
+GENERATE_PREDICTOR_1(2, upper[i])
+// Predictor3: Top-right (upper[i + 1]).
+GENERATE_PREDICTOR_1(3, upper[i + 1])
+// Predictor4: Top-left (upper[i - 1]).
+GENERATE_PREDICTOR_1(4, upper[i - 1])
+#undef GENERATE_PREDICTOR_1
+
+// Due to averages with integers, values cannot be accumulated in parallel for
+// predictors 5 to 7.
+GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
+GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
+GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
+
+#define GENERATE_PREDICTOR_2(X, IN)                                           \
+static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+                                   int num_pixels, uint32_t* out) {           \
+  int i;                                                                      \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
+    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);             \
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
+    __m128i avg, res;                                                         \
+    Average2_m128i(&T, &Tother, &avg);                                        \
+    res = _mm_add_epi8(avg, src);                                             \
+    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
+  }                                                                           \
+  if (i != num_pixels) {                                                      \
+    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
+  }                                                                           \
+}
+// Predictor8: per-byte average of T and TL (upper[i - 1]), added to 'in'.
+GENERATE_PREDICTOR_2(8, upper[i - 1])
+// Predictor9: per-byte average of T and TR (upper[i + 1]), added to 'in'.
+GENERATE_PREDICTOR_2(9, upper[i + 1])
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: average of (average of (L,TL), average of (T, TR)).
+static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i, j;
+  __m128i L = _mm_cvtsi32_si128(out[-1]);  // left pixel in the low 32 bits
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+    __m128i avgTTR;
+    Average2_m128i(&T, &TR, &avgTTR);  // needs no left pixel: done once
+    for (j = 0; j < 4; ++j) {
+      __m128i avgLTL, avg;
+      Average2_m128i(&L, &TL, &avgLTL);
+      Average2_m128i(&avgTTR, &avgLTL, &avg);
+      L = _mm_add_epi8(avg, src);
+      out[i + j] = _mm_cvtsi128_si32(L);  // only the low 32 bits are stored
+      // Shift the pre-computed values down by one pixel for the next iteration.
+      avgTTR = _mm_srli_si128(avgTTR, 4);
+      TL = _mm_srli_si128(TL, 4);
+      src = _mm_srli_si128(src, 4);
+    }
+  }
+  if (i != num_pixels) {  // leftover pixels go through the C implementation
+    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
+                            __m128i* const out) {
+  // We can unpack with any value on the upper 32 bits, provided it's the same
+  // on both operands (so that their sum of abs diff is zero). Here we use *A.
+  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);  // sums for pixels 0 and 1
+  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);  // sums for pixels 2 and 3
+  *out = _mm_packs_epi32(s_lo, s_hi);  // one 32-bit sum per pixel
+}
+
+static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i, j;
+  __m128i L = _mm_cvtsi32_si128(out[-1]);  // pixel left of out[0]
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    __m128i pa;
+    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
+    for (j = 0; j < 4; ++j) {
+      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
+      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
+      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
+      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+      const __m128i A = _mm_and_si128(mask, L);
+      const __m128i B = _mm_andnot_si128(mask, T);
+      const __m128i pred = _mm_or_si128(A, B);    // pred = (pb > pa)? L : T
+      L = _mm_add_epi8(src, pred);  // new output pixel, also the next 'left'
+      out[i + j] = _mm_cvtsi128_si32(L);
+      // Shift the pre-computed value for the next iteration.
+      T = _mm_srli_si128(T, 4);
+      TL = _mm_srli_si128(TL, 4);
+      src = _mm_srli_si128(src, 4);
+      pa = _mm_srli_si128(pa, 4);
+    }
+  }
+  if (i != num_pixels) {  // fewer than 4 pixels remain: use the C version
+    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor12: ClampedAddSubtractFull.
+#define DO_PRED12(DIFF, LANE, OUT)                          \
+do {                                                        \
+  const __m128i all = _mm_add_epi16(L, (DIFF));             \
+  const __m128i alls = _mm_packus_epi16(all, all);          \
+  const __m128i res = _mm_add_epi8(src, alls);              \
+  out[i + (OUT)] = _mm_cvtsi128_si32(res);                  \
+  L = _mm_unpacklo_epi8(res, zero);                         \
+  /* Shift the pre-computed value for the next iteration.*/ \
+  if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
+  src = _mm_srli_si128(src, 4);                             \
+} while (0)
+
+static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i L8 = _mm_cvtsi32_si128(out[-1]);  // pixel left of out[0]
+  __m128i L = _mm_unpacklo_epi8(L8, zero);  // widened to 16 bits per channel
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    // Load 4 pixels at a time.
+    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);  // T - TL, pixels 0 and 1
+    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);  // T - TL, pixels 2 and 3
+    DO_PRED12(diff_lo, 0, 0);
+    DO_PRED12(diff_lo, 1, 1);
+    DO_PRED12(diff_hi, 0, 2);
+    DO_PRED12(diff_hi, 1, 3);
+  }
+  if (i != num_pixels) {  // fewer than 4 pixels remain: use the C version
+    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+#undef DO_PRED12
+
+// Due to averages with integers, values cannot be accumulated in parallel for
+// predictor 13.
+GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
+
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
+                                 uint32_t* dst) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
     const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
     const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
     const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
     const __m128i out = _mm_add_epi8(in, C);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+    _mm_storeu_si128((__m128i*)&dst[i], out);
   }
   // fallthrough and finish off with plain-C
-  VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
+  if (i != num_pixels) {
+    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
+  }
 }
 
 //------------------------------------------------------------------------------
 // Color Transform
 
 static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  uint32_t* argb_data, int num_pixels) {
-  // sign-extended multiplying constants, pre-shifted by 5.
+                                  const uint32_t* const src, int num_pixels,
+                                  uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
   const __m128i mults_rb = _mm_set_epi16(
       CST(green_to_red_), CST(green_to_blue_),
@@ -188,7 +431,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
   const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
     const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
     const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
     const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
@@ -200,15 +443,53 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
     const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
     const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
     const __m128i out = _mm_or_si128(J, A);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+    _mm_storeu_si128((__m128i*)&dst[i], out);
   }
   // Fall-back to C-version for left-overs.
-  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+  if (i != num_pixels) {
+    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+  }
 }
 
 //------------------------------------------------------------------------------
 // Color-space conversion functions
 
+static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
+                             uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+
+  while (num_pixels >= 32) {  // 32 pixels: 128 bytes loaded, 96 bytes stored
+    // Load the BGRA buffers.
+    __m128i in0 = _mm_loadu_si128(in + 0);
+    __m128i in1 = _mm_loadu_si128(in + 1);
+    __m128i in2 = _mm_loadu_si128(in + 2);
+    __m128i in3 = _mm_loadu_si128(in + 3);
+    __m128i in4 = _mm_loadu_si128(in + 4);
+    __m128i in5 = _mm_loadu_si128(in + 5);
+    __m128i in6 = _mm_loadu_si128(in + 6);
+    __m128i in7 = _mm_loadu_si128(in + 7);
+    VP8L32bToPlanar(&in0, &in1, &in2, &in3);
+    VP8L32bToPlanar(&in4, &in5, &in6, &in7);
+    // At this point, in1/in5 contain red only, in2/in6 green only ...
+    // Pack the colors in 24b RGB.
+    VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
+    _mm_storeu_si128(out + 0, in1);
+    _mm_storeu_si128(out + 1, in5);
+    _mm_storeu_si128(out + 2, in2);
+    _mm_storeu_si128(out + 3, in6);
+    _mm_storeu_si128(out + 4, in3);
+    _mm_storeu_si128(out + 5, in7);
+    in += 8;
+    out += 6;
+    num_pixels -= 32;
+  }
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
 static void ConvertBGRAToRGBA(const uint32_t* src,
                               int num_pixels, uint8_t* dst) {
   const __m128i* in = (const __m128i*)src;
@@ -233,7 +514,9 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
     num_pixels -= 8;
   }
   // left-overs
-  VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
 }
 
 static void ConvertBGRAToRGBA4444(const uint32_t* src,
@@ -267,7 +550,9 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     num_pixels -= 8;
   }
   // left-overs
-  VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
 }
 
 static void ConvertBGRAToRGB565(const uint32_t* src,
@@ -306,7 +591,9 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     num_pixels -= 8;
   }
   // left-overs
-  VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
 }
 
 static void ConvertBGRAToBGR(const uint32_t* src,
@@ -337,7 +624,9 @@ static void ConvertBGRAToBGR(const uint32_t* src,
     num_pixels -= 8;
   }
   // left-overs
-  VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -346,19 +635,35 @@ static void ConvertBGRAToBGR(const uint32_t* src,
 extern void VP8LDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
-  VP8LPredictors[5] = Predictor5;
-  VP8LPredictors[6] = Predictor6;
-  VP8LPredictors[7] = Predictor7;
-  VP8LPredictors[8] = Predictor8;
-  VP8LPredictors[9] = Predictor9;
-  VP8LPredictors[10] = Predictor10;
-  VP8LPredictors[11] = Predictor11;
-  VP8LPredictors[12] = Predictor12;
-  VP8LPredictors[13] = Predictor13;
+  VP8LPredictors[5] = Predictor5_SSE2;
+  VP8LPredictors[6] = Predictor6_SSE2;
+  VP8LPredictors[7] = Predictor7_SSE2;
+  VP8LPredictors[8] = Predictor8_SSE2;
+  VP8LPredictors[9] = Predictor9_SSE2;
+  VP8LPredictors[10] = Predictor10_SSE2;
+  VP8LPredictors[11] = Predictor11_SSE2;
+  VP8LPredictors[12] = Predictor12_SSE2;
+  VP8LPredictors[13] = Predictor13_SSE2;
+
+  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
+  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
+  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
+  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
+  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
+  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
+  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
+  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
+  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
+  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
+  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
+  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
+  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
+  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
 
   VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
   VP8LTransformColorInverse = TransformColorInverse;
 
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
   VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
   VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h
index 5c707f4..d0e5f45 100644
--- a/src/3rdparty/libwebp/src/dsp/msa_macro.h
+++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h
@@ -23,12 +23,24 @@
 
 #ifdef CLANG_BUILD
   #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)a, b)
+  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)a, b)
+  #define SRAI_B(a, b)  __msa_srai_b((v16i8)a, b)
   #define SRAI_H(a, b)  __msa_srai_h((v8i16)a, b)
   #define SRAI_W(a, b)  __msa_srai_w((v4i32)a, b)
+  #define SRLI_H(a, b)  __msa_srli_h((v8i16)a, b)
+  #define SLLI_B(a, b)  __msa_slli_b((v16i8)a, b)
+  #define ANDI_B(a, b)  __msa_andi_b((v16u8)a, b)
+  #define ORI_B(a, b)   __msa_ori_b((v16u8)a, b)
 #else
   #define ADDVI_H(a, b)  (a + b)
+  #define ADDVI_W(a, b)  (a + b)
+  #define SRAI_B(a, b)  (a >> b)
   #define SRAI_H(a, b)  (a >> b)
   #define SRAI_W(a, b)  (a >> b)
+  #define SRLI_H(a, b)  (a >> b)  // right shift; logical when 'a' is unsigned
+  #define SLLI_B(a, b)  (a << b)
+  #define ANDI_B(a, b)  (a & b)
+  #define ORI_B(a, b)   (a | b)
 #endif
 
 #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
@@ -116,13 +128,13 @@
   #define SH(val, pdst)  MSA_STORE(val, pdst, msa_ush)
   MSA_STORE_FUNC(uint32_t, usw, msa_usw);
   #define SW(val, pdst)  MSA_STORE(val, pdst, msa_usw)
-  #define SD(val, pdst) {                                                  \
+  #define SD(val, pdst) do {                                               \
     uint8_t* const pdst_sd_m = (uint8_t*)(pdst);                           \
     const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF);          \
     const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF);  \
     SW(val0_m, pdst_sd_m);                                                 \
     SW(val1_m, pdst_sd_m + 4);                                             \
-  }
+  } while (0)
 #endif  // (__mips_isa_rev >= 6)
 
 /* Description : Load 4 words with stride
@@ -133,34 +145,68 @@
  *               Load word in 'out2' from (psrc + 2 * stride)
  *               Load word in 'out3' from (psrc + 3 * stride)
  */
-#define LW4(psrc, stride, out0, out1, out2, out3) {  \
-  const uint8_t* ptmp = (const uint8_t*)psrc;        \
-  out0 = LW(ptmp);                                   \
-  ptmp += stride;                                    \
-  out1 = LW(ptmp);                                   \
-  ptmp += stride;                                    \
-  out2 = LW(ptmp);                                   \
-  ptmp += stride;                                    \
-  out3 = LW(ptmp);                                   \
-}
+#define LW4(psrc, stride, out0, out1, out2, out3) do {  \
+  const uint8_t* ptmp = (const uint8_t*)psrc;           \
+  out0 = LW(ptmp);                                      \
+  ptmp += stride;                                       \
+  out1 = LW(ptmp);                                      \
+  ptmp += stride;                                       \
+  out2 = LW(ptmp);                                      \
+  ptmp += stride;                                       \
+  out3 = LW(ptmp);                                      \
+} while (0)
 
-/* Description : Store 4 words with stride
+/* Description : Store words with stride
  * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
  * Details     : Store word from 'in0' to (pdst)
  *               Store word from 'in1' to (pdst + stride)
  *               Store word from 'in2' to (pdst + 2 * stride)
  *               Store word from 'in3' to (pdst + 3 * stride)
  */
-#define SW4(in0, in1, in2, in3, pdst, stride) {  \
-  uint8_t* ptmp = (uint8_t*)pdst;                \
-  SW(in0, ptmp);                                 \
-  ptmp += stride;                                \
-  SW(in1, ptmp);                                 \
-  ptmp += stride;                                \
-  SW(in2, ptmp);                                 \
-  ptmp += stride;                                \
-  SW(in3, ptmp);                                 \
-}
+#define SW4(in0, in1, in2, in3, pdst, stride) do {  \
+  uint8_t* ptmp = (uint8_t*)pdst;                   \
+  SW(in0, ptmp);                                    \
+  ptmp += stride;                                   \
+  SW(in1, ptmp);                                    \
+  ptmp += stride;                                   \
+  SW(in2, ptmp);                                    \
+  ptmp += stride;                                   \
+  SW(in3, ptmp);                                    \
+} while (0)
+
+#define SW3(in0, in1, in2, pdst, stride) do {  \
+  uint8_t* ptmp = (uint8_t*)pdst;              \
+  SW(in0, ptmp);                               \
+  ptmp += stride;                              \
+  SW(in1, ptmp);                               \
+  ptmp += stride;                              \
+  SW(in2, ptmp);                               \
+} while (0)
+
+#define SW2(in0, in1, pdst, stride) do {  \
+  uint8_t* ptmp = (uint8_t*)pdst;         \
+  SW(in0, ptmp);                          \
+  ptmp += stride;                         \
+  SW(in1, ptmp);                          \
+} while (0)
+
+/* Description : Store 4 double words with stride
+ * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+ * Details     : Store double word from 'in0' to (pdst)
+ *               Store double word from 'in1' to (pdst + stride)
+ *               Store double word from 'in2' to (pdst + 2 * stride)
+ *               Store double word from 'in3' to (pdst + 3 * stride)
+ */
+#define SD4(in0, in1, in2, in3, pdst, stride) do {  \
+  uint8_t* ptmp = (uint8_t*)pdst;                   \
+  SD(in0, ptmp);                                    \
+  ptmp += stride;                                   \
+  SD(in1, ptmp);                                    \
+  ptmp += stride;                                   \
+  SD(in2, ptmp);                                    \
+  ptmp += stride;                                   \
+  SD(in3, ptmp);                                    \
+} while (0)
 
 /* Description : Load vectors with 16 byte elements with stride
  * Arguments   : Inputs  - psrc, stride
@@ -169,33 +215,169 @@
  * Details     : Load 16 byte elements in 'out0' from (psrc)
  *               Load 16 byte elements in 'out1' from (psrc + stride)
  */
-#define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
-  out0 = LD_B(RTYPE, psrc);                       \
-  out1 = LD_B(RTYPE, psrc + stride);              \
-}
+#define LD_B2(RTYPE, psrc, stride, out0, out1) do {  \
+  out0 = LD_B(RTYPE, psrc);                          \
+  out1 = LD_B(RTYPE, psrc + stride);                 \
+} while (0)
 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
-  LD_B2(RTYPE, psrc, stride, out0, out1);                     \
-  LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3);       \
-}
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do {  \
+  LD_B2(RTYPE, psrc, stride, out0, out1);                  \
+  out2 = LD_B(RTYPE, psrc + 2 * stride);                   \
+} while (0)
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
+  LD_B2(RTYPE, psrc, stride, out0, out1);                        \
+  LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3);          \
+} while (0)
 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 
+#define LD_B8(RTYPE, psrc, stride,                                  \
+              out0, out1, out2, out3, out4, out5, out6, out7) do {  \
+  LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3);               \
+  LD_B4(RTYPE, psrc + 4 * stride, stride, out4, out5, out6, out7);  \
+} while (0)
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
 /* Description : Load vectors with 8 halfword elements with stride
  * Arguments   : Inputs  - psrc, stride
  *               Outputs - out0, out1
  * Details     : Load 8 halfword elements in 'out0' from (psrc)
  *               Load 8 halfword elements in 'out1' from (psrc + stride)
  */
-#define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
-  out0 = LD_H(RTYPE, psrc);                       \
-  out1 = LD_H(RTYPE, psrc + stride);              \
-}
+#define LD_H2(RTYPE, psrc, stride, out0, out1) do {  \
+  out0 = LD_H(RTYPE, psrc);                          \
+  out1 = LD_H(RTYPE, psrc + stride);                 \
+} while (0)
 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 
+/* Description : Load vectors with 4 word elements with stride
+ * Arguments   : Inputs  - psrc, stride
+ *               Outputs - out0, out1, out2, out3
+ * Details     : Load 4 word elements in 'out0' from (psrc + 0 * stride)
+ *               Load 4 word elements in 'out1' from (psrc + 1 * stride)
+ *               Load 4 word elements in 'out2' from (psrc + 2 * stride)
+ *               Load 4 word elements in 'out3' from (psrc + 3 * stride)
+ */
+#define LD_W2(RTYPE, psrc, stride, out0, out1) do {  \
+  out0 = LD_W(RTYPE, psrc);                          \
+  out1 = LD_W(RTYPE, psrc + stride);                 \
+} while (0)
+#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
+#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)
+
+#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do {  \
+  LD_W2(RTYPE, psrc, stride, out0, out1);                  \
+  out2 = LD_W(RTYPE, psrc + 2 * stride);                   \
+} while (0)
+#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
+#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)
+
+#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
+  LD_W2(RTYPE, psrc, stride, out0, out1);                        \
+  LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3);           \
+} while (0)
+#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
+#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)
+
+/* Description : Store vectors of 16 byte elements with stride
+ * Arguments   : Inputs - in0, in1, pdst, stride
+ * Details     : Store 16 byte elements from 'in0' to (pdst)
+ *               Store 16 byte elements from 'in1' to (pdst + stride)
+ */
+#define ST_B2(RTYPE, in0, in1, pdst, stride) do {  \
+  ST_B(RTYPE, in0, pdst);                          \
+  ST_B(RTYPE, in1, pdst + stride);                 \
+} while (0)
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do {  \
+  ST_B2(RTYPE, in0, in1, pdst, stride);                      \
+  ST_B2(RTYPE, in2, in3, pdst + 2 * stride, stride);         \
+} while (0)
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
+              pdst, stride) do {                                \
+  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);               \
+  ST_B4(RTYPE, in4, in5, in6, in7, pdst + 4 * stride, stride);  \
+} while (0)
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 4 word elements with stride
+ * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+ * Details     : Store 4 word elements from 'in0' to (pdst + 0 * stride)
+ *               Store 4 word elements from 'in1' to (pdst + 1 * stride)
+ *               Store 4 word elements from 'in2' to (pdst + 2 * stride)
+ *               Store 4 word elements from 'in3' to (pdst + 3 * stride)
+ */
+#define ST_W2(RTYPE, in0, in1, pdst, stride) do {  \
+  ST_W(RTYPE, in0, pdst);                          \
+  ST_W(RTYPE, in1, pdst + stride);                 \
+} while (0)
+#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
+#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)
+
+#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do {  \
+  ST_W2(RTYPE, in0, in1, pdst, stride);                 \
+  ST_W(RTYPE, in2, pdst + 2 * stride);                  \
+} while (0)
+#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
+#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)
+
+#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do {  \
+  ST_W2(RTYPE, in0, in1, pdst, stride);                      \
+  ST_W2(RTYPE, in2, in3, pdst + 2 * stride, stride);         \
+} while (0)
+#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
+#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+ * Arguments   : Inputs - in0, in1, pdst, stride
+ * Details     : Store 8 halfword elements from 'in0' to (pdst)
+ *               Store 8 halfword elements from 'in1' to (pdst + stride)
+ */
+#define ST_H2(RTYPE, in0, in1, pdst, stride) do {  \
+  ST_H(RTYPE, in0, pdst);                          \
+  ST_H(RTYPE, in1, pdst + stride);                 \
+} while (0)
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ * Arguments   : Inputs - in, stidx, pdst, stride
+ * Details     : Index 'stidx' halfword element from 'in' vector is copied to
+ *               the GP register and stored to (pdst)
+ *               Index 'stidx+1' halfword element from 'in' vector is copied to
+ *               the GP register and stored to (pdst + stride)
+ *               Index 'stidx+2' halfword element from 'in' vector is copied to
+ *               the GP register and stored to (pdst + 2 * stride)
+ *               Index 'stidx+3' halfword element from 'in' vector is copied to
+ *               the GP register and stored to (pdst + 3 * stride)
+ */
+#define ST2x4_UB(in, stidx, pdst, stride) do {                   \
+  uint8_t* pblk_2x4_m = (uint8_t*)pdst;                          \
+  const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx);      \
+  const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1);  \
+  const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2);  \
+  const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3);  \
+  SH(out0_m, pblk_2x4_m);                                        \
+  pblk_2x4_m += stride;                                          \
+  SH(out1_m, pblk_2x4_m);                                        \
+  pblk_2x4_m += stride;                                          \
+  SH(out2_m, pblk_2x4_m);                                        \
+  pblk_2x4_m += stride;                                          \
+  SH(out3_m, pblk_2x4_m);                                        \
+} while (0)
+
 /* Description : Store 4x4 byte block to destination memory from input vector
  * Arguments   : Inputs - in0, in1, pdst, stride
  * Details     : 'Idx0' word element from input vector 'in0' is copied to the
@@ -207,14 +389,20 @@
  *               'Idx3' word element from input vector 'in0' is copied to the
  *               GP register and stored to (pdst + 3 * stride)
  */
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
-  uint8_t* const pblk_4x4_m = (uint8_t*)pdst;                       \
-  const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0);         \
-  const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1);         \
-  const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2);         \
-  const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3);         \
-  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
-}
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do {  \
+  uint8_t* const pblk_4x4_m = (uint8_t*)pdst;                          \
+  const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0);            \
+  const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1);            \
+  const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2);            \
+  const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3);            \
+  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);             \
+} while (0)
+
+#define ST4x8_UB(in0, in1, pdst, stride) do {                     \
+  uint8_t* const pblk_4x8 = (uint8_t*)pdst;                       \
+  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
+  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
+} while (0)
 
 /* Description : Immediate number of elements to slide
  * Arguments   : Inputs  - in0, in1, slide_val
@@ -230,6 +418,30 @@
 #define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
 #define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)
 
+/* Description : Shuffle byte vector elements as per mask vector
+ * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+ *               'out0' as per control vector 'mask0'
+ */
+#define VSHF_B(RTYPE, in0, in1, mask)                              \
+        (RTYPE)__msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0)
+
+#define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
+#define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
+#define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
+#define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)
+
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {  \
+  out0 = VSHF_B(RTYPE, in0, in1, mask0);                                   \
+  out1 = VSHF_B(RTYPE, in2, in3, mask1);                                   \
+} while (0)
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
+
 /* Description : Shuffle halfword vector elements as per mask vector
  * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
  *               Outputs - out0, out1
@@ -237,44 +449,219 @@
  * Details     : halfword elements from 'in0' & 'in1' are copied selectively to
  *               'out0' as per control vector 'mask0'
  */
-#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
-  out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);     \
-  out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);     \
-}
+#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {  \
+  out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);        \
+  out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);        \
+} while (0)
 #define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
 
+/* Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Signed byte elements from 'mult0' are multiplied with
+ *               signed byte elements from 'cnst0' producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               The multiplication results of adjacent odd-even elements
+ *               are added together and written to the 'out0' vector
+ */
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
+  out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);           \
+  out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);           \
+} while (0)
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+ * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Signed halfword elements from 'mult0' are multiplied with
+ *               signed halfword elements from 'cnst0' producing a result
+ *               twice the size of input i.e. signed word.
+ *               The multiplication result of adjacent odd-even elements
+ *               are added together and written to the 'out0' vector
+ */
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
+  out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);           \
+  out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);           \
+} while (0)
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of unsigned word vector elements
+ * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Unsigned word elements from 'mult0' are multiplied with
+ *               unsigned word elements from 'cnst0' producing a result
+ *               twice the size of input i.e. unsigned double word.
+ *               The multiplication result of adjacent odd-even elements
+ *               are added together and written to the 'out0' vector
+ */
+#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {  \
+  out0 = (RTYPE)__msa_dotp_u_d((v4u32)mult0, (v4u32)cnst0);           \
+  out1 = (RTYPE)__msa_dotp_u_d((v4u32)mult1, (v4u32)cnst1);           \
+} while (0)
+#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+ * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Signed halfword elements from 'mult0' are multiplied with
+ *               signed halfword elements from 'cnst0' producing a result
+ *               twice the size of input i.e. signed word.
+ *               The multiplication result of adjacent odd-even elements
+ *               are added to the 'out0' vector
+ */
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {      \
+  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
+  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
+} while (0)
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
 /* Description : Clips all signed halfword elements of input vector
  *               between 0 & 255
  * Arguments   : Input/output  - val
  *               Return Type - signed halfword
  */
-#define CLIP_SH_0_255(val) {                      \
+#define CLIP_SH_0_255(val) do {                   \
   const v8i16 max_m = __msa_ldi_h(255);           \
   val = __msa_maxi_s_h((v8i16)val, 0);            \
   val = __msa_min_s_h(max_m, (v8i16)val);         \
-}
-#define CLIP_SH2_0_255(in0, in1) {  \
-  CLIP_SH_0_255(in0);               \
-  CLIP_SH_0_255(in1);               \
-}
+} while (0)
+
+#define CLIP_SH2_0_255(in0, in1) do {  \
+  CLIP_SH_0_255(in0);                  \
+  CLIP_SH_0_255(in1);                  \
+} while (0)
+
+#define CLIP_SH4_0_255(in0, in1, in2, in3) do {  \
+  CLIP_SH2_0_255(in0, in1);                      \
+  CLIP_SH2_0_255(in2, in3);                      \
+} while (0)
+
+/* Description : Clips all unsigned halfword elements of input vector
+ *               between 0 & 255
+ * Arguments   : Input/output  - in
+ *               (clipped in place; there is no separate output variable)
+ *               Return Type - unsigned halfword
+ */
+#define CLIP_UH_0_255(in) do {                    \
+  const v8u16 max_m = (v8u16)__msa_ldi_h(255);    \
+  in = __msa_maxi_u_h((v8u16) in, 0);             \
+  in = __msa_min_u_h((v8u16) max_m, (v8u16) in);  \
+} while (0)
+
+#define CLIP_UH2_0_255(in0, in1) do {  \
+  CLIP_UH_0_255(in0);                  \
+  CLIP_UH_0_255(in1);                  \
+} while (0)
 
 /* Description : Clips all signed word elements of input vector
  *               between 0 & 255
  * Arguments   : Input/output  - val
  *               Return Type - signed word
  */
-#define CLIP_SW_0_255(val) {                      \
+#define CLIP_SW_0_255(val) do {                   \
   const v4i32 max_m = __msa_ldi_w(255);           \
   val = __msa_maxi_s_w((v4i32)val, 0);            \
   val = __msa_min_s_w(max_m, (v4i32)val);         \
+} while (0)
+
+#define CLIP_SW4_0_255(in0, in1, in2, in3) do {   \
+  CLIP_SW_0_255(in0);                             \
+  CLIP_SW_0_255(in1);                             \
+  CLIP_SW_0_255(in2);                             \
+  CLIP_SW_0_255(in3);                             \
+} while (0)
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+ * Arguments   : Input  - in       (signed word vector)
+ *               Output - sum_m    (i32 sum)
+ *               Return Type - signed word (GP)
+ * Details     : 4 signed word elements of 'in' vector are added together and
+ *               the resulting integer sum is returned
+ */
+static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
+  const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
+  const v2i64 res1_m = __msa_splati_d(res0_m, 1);
+  const v2i64 out = res0_m + res1_m;
+  int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
+  return sum_m;
 }
-#define CLIP_SW4_0_255(in0, in1, in2, in3) {  \
-  CLIP_SW_0_255(in0);                         \
-  CLIP_SW_0_255(in1);                         \
-  CLIP_SW_0_255(in2);                         \
-  CLIP_SW_0_255(in3);                         \
+#define HADD_SW_S32(in) func_hadd_sw_s32(in)
+
+/* Description : Horizontal addition of 8 signed halfword elements
+ * Arguments   : Input  - in       (signed halfword vector)
+ *               Output - sum_m    (s32 sum)
+ *               Return Type - signed word
+ * Details     : 8 signed halfword elements of input vector are added
+ *               together and the resulting integer sum is returned
+ */
+static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
+  const v4i32 res = __msa_hadd_s_w(in, in);
+  const v2i64 res0 = __msa_hadd_s_d(res, res);
+  const v2i64 res1 = __msa_splati_d(res0, 1);
+  const v2i64 res2 = res0 + res1;
+  const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
+  return sum_m;
+}
+#define HADD_SH_S32(in) func_hadd_sh_s32(in)
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ * Arguments   : Input  - in       (unsigned halfword vector)
+ *               Output - sum_m    (u32 sum)
+ *               Return Type - unsigned word
+ * Details     : 8 unsigned halfword elements of input vector are added
+ *               together and the resulting integer sum is returned
+ */
+static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
+  uint32_t sum_m;
+  const v4u32 res_m = __msa_hadd_u_w(in, in);
+  v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
+  v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
+  res0_m = res0_m + res1_m;
+  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
+  return sum_m;
 }
+#define HADD_UH_U32(in) func_hadd_uh_u32(in)
+
+/* Description : Horizontal addition of signed half word vector elements
+ * Arguments   : Inputs  - in0, in1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Each signed odd half word element from 'in0' is added to
+ *               even signed half word element from 'in0' (pairwise) and the
+ *               word result is written in 'out0'
+ */
+#define HADD_SH2(RTYPE, in0, in1, out0, out1) do {       \
+  out0 = (RTYPE)__msa_hadd_s_w((v8i16)in0, (v8i16)in0);  \
+  out1 = (RTYPE)__msa_hadd_s_w((v8i16)in1, (v8i16)in1);  \
+} while (0)
+#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)
+
+#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  HADD_SH2(RTYPE, in0, in1, out0, out1);                                  \
+  HADD_SH2(RTYPE, in2, in3, out2, out3);                                  \
+} while (0)
+#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ * Arguments   : Inputs  - in0, in1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Each unsigned even byte element of 'in0' is subtracted from
+ *               the adjacent odd unsigned byte element of 'in0' (pairwise)
+ *               and the halfword result is written to 'out0'
+ */
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {       \
+  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
+  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
+} while (0)
+#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
 
 /* Description : Set element n input vector to GPR value
  * Arguments   : Inputs - in0, in1, in2, in3
@@ -282,23 +669,188 @@
  *               Return Type - as per RTYPE
  * Details     : Set element 0 in vector 'out' to value specified in 'in0'
  */
-#define INSERT_W2(RTYPE, in0, in1, out) {           \
+#define INSERT_W2(RTYPE, in0, in1, out) do {        \
   out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
   out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
-}
+} while (0)
 #define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
 
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {  \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);   \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);   \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);   \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);   \
-}
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do {  \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);      \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);      \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);      \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);      \
+} while (0)
 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
 #define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
 
+/* Description : Set element n of double word input vector to GPR value
+ * Arguments   : Inputs - in0, in1
+ *               Output - out
+ *               Return Type - as per RTYPE
+ * Details     : Set element 0 in vector 'out' to GPR value specified in 'in0'
+ *               Set element 1 in vector 'out' to GPR value specified in 'in1'
+ */
+#define INSERT_D2(RTYPE, in0, in1, out) do {        \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+} while (0)
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even byte elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ */
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);        \
+  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);        \
+} while (0)
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
+#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave odd byte elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ */
+#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvod_b((v16i8)in1, (v16i8)in0);        \
+  out1 = (RTYPE)__msa_ilvod_b((v16i8)in3, (v16i8)in2);        \
+} while (0)
+#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
+#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
+#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
+#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
+#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ */
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);        \
+  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);        \
+} while (0)
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave odd halfword elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ */
+#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvod_h((v8i16)in1, (v8i16)in0);        \
+  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2);        \
+} while (0)
+#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
+#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
+#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
+#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even word elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ */
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);        \
+  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);        \
+} while (0)
+#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
+#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even-odd word elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even word elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ *               Odd word elements of 'in2' and 'in3' are interleaved
+ *               and written to 'out1'
+ */
+#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);          \
+  out1 = (RTYPE)__msa_ilvod_w((v4i32)in3, (v4i32)in2);          \
+} while (0)
+#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
+#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
+#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
+#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even-odd half-word elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even half-word elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ *               Odd half-word elements of 'in2' and 'in3' are interleaved
+ *               and written to 'out1'
+ */
+#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);          \
+  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2);          \
+} while (0)
+#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
+#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
+#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
+#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even double word elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'
+ */
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);        \
+  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);        \
+} while (0)
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
+#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
+#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+ *               and written to 'out0'.
+ */
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);        \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);        \
+} while (0)
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)
+
 /* Description : Interleave right half of byte elements from vectors
  * Arguments   : Inputs  - in0, in1, in2, in3
  *               Outputs - out0, out1
@@ -306,10 +858,10 @@
  * Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
  *               and written to out0.
  */
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
-  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);     \
-}
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);        \
+  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);        \
+} while (0)
 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
@@ -317,10 +869,10 @@
 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
 
 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
+                out0, out1, out2, out3) do {                    \
   ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
+} while (0)
 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
@@ -334,19 +886,19 @@
  * Details     : Right half of halfword elements of 'in0' and 'in1' are
  *               interleaved and written to 'out0'.
  */
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
-  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
-}
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);        \
+  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);        \
+} while (0)
 #define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
 
 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
+                out0, out1, out2, out3) do {                    \
   ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
   ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
+} while (0)
 #define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
@@ -358,31 +910,57 @@
  * Details     : Right half of double word elements of 'in0' and 'in1' are
  *               interleaved and written to 'out0'.
  */
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1);     \
-  out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3);     \
-}
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1);        \
+  out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3);        \
+} while (0)
 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
 
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {        \
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) do {                    \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+} while (0)
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ * Arguments   : Inputs  - in0, in1
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Right half of byte elements from 'in0' and 'in1' are
+ *               interleaved into 'out0'; the left halves go to 'out1'
+ */
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do {     \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+} while (0)
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do {     \
   out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
   out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
-}
+} while (0)
 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
 #define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
 
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {        \
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do {     \
   out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
   out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
-}
+} while (0)
 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
 
 /* Description : Pack even byte elements of vector pairs
  *  Arguments   : Inputs  - in0, in1, in2, in3
@@ -392,15 +970,76 @@
  *                'out0' & even byte elements of 'in1' are copied to the right
  *                half of 'out0'.
  */
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \
-  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);     \
-}
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);        \
+  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);        \
+} while (0)
 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
 
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) do {                    \
+  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+} while (0)
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even halfword elements of 'in0' are copied to the left half of
+ *               'out0' & even halfword elements of 'in1' are copied to the
+ *               right half of 'out0'.
+ */
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);        \
+  out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);        \
+} while (0)
+#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
+
+/* Description : Pack even word elements of vector pairs
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Even word elements of 'in0' are copied to the left half of
+ *               'out0' & even word elements of 'in1' are copied to the
+ *               right half of 'out0'.
+ */
+#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_pckev_w((v4i32)in0, (v4i32)in1);        \
+  out1 = (RTYPE)__msa_pckev_w((v4i32)in2, (v4i32)in3);        \
+} while (0)
+#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
+#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
+#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
+#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
+
+/* Description : Pack odd halfword elements of vector pairs
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Odd halfword elements of 'in0' are copied to the left half of
+ *               'out0' & odd halfword elements of 'in1' are copied to the
+ *               right half of 'out0'.
+ */
+#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_pckod_h((v8i16)in0, (v8i16)in1);        \
+  out1 = (RTYPE)__msa_pckod_h((v8i16)in2, (v8i16)in3);        \
+} while (0)
+#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
+#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
+#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
+#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)
+
 /* Description : Arithmetic immediate shift right all elements of word vector
  * Arguments   : Inputs  - in0, in1, shift
  *               Outputs - in place operation
@@ -408,17 +1047,17 @@
  * Details     : Each element of vector 'in0' is right shifted by 'shift' and
  *               the result is written in-place. 'shift' is a GP variable.
  */
-#define SRAI_W2(RTYPE, in0, in1, shift_val) {  \
-  in0 = (RTYPE)SRAI_W(in0, shift_val);         \
-  in1 = (RTYPE)SRAI_W(in1, shift_val);         \
-}
+#define SRAI_W2(RTYPE, in0, in1, shift_val) do {  \
+  in0 = (RTYPE)SRAI_W(in0, shift_val);            \
+  in1 = (RTYPE)SRAI_W(in1, shift_val);            \
+} while (0)
 #define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
 #define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)
 
-#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) {  \
-  SRAI_W2(RTYPE, in0, in1, shift_val);                   \
-  SRAI_W2(RTYPE, in2, in3, shift_val);                   \
-}
+#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do {  \
+  SRAI_W2(RTYPE, in0, in1, shift_val);                      \
+  SRAI_W2(RTYPE, in2, in3, shift_val);                      \
+} while (0)
 #define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
 #define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
 
@@ -429,10 +1068,10 @@
  * Details     : Each element of vector 'in0' is right shifted by 'shift' and
  *               the result is written in-place. 'shift' is a GP variable.
  */
-#define SRAI_H2(RTYPE, in0, in1, shift_val) {  \
-  in0 = (RTYPE)SRAI_H(in0, shift_val);         \
-  in1 = (RTYPE)SRAI_H(in1, shift_val);         \
-}
+#define SRAI_H2(RTYPE, in0, in1, shift_val) do {  \
+  in0 = (RTYPE)SRAI_H(in0, shift_val);            \
+  in1 = (RTYPE)SRAI_H(in1, shift_val);            \
+} while (0)
 #define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
 #define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
 
@@ -443,48 +1082,166 @@
  * Details     : Each element of vector 'in0' is right shifted by 'shift' and
  *               the result is written in-place. 'shift' is a GP variable.
  */
-#define SRARI_W2(RTYPE, in0, in1, shift) {        \
+#define SRARI_W2(RTYPE, in0, in1, shift) do {     \
   in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
   in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
-}
+} while (0)
 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
 
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
-  SRARI_W2(RTYPE, in0, in1, shift);                   \
-  SRARI_W2(RTYPE, in2, in3, shift);                   \
-}
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do {  \
+  SRARI_W2(RTYPE, in0, in1, shift);                      \
+  SRARI_W2(RTYPE, in2, in3, shift);                      \
+} while (0)
 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
 #define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
 
+/* Description : Shift right arithmetic rounded double words
+ * Arguments   : Inputs  - in0, in1, shift
+ *               Outputs - in place operation
+ *               Return Type - as per RTYPE
+ * Details     : Each element of vector 'in0' is shifted right arithmetically by
+ *               the number of bits in the corresponding element in the vector
+ *               'shift'. The last discarded bit is added to shifted value for
+ *               rounding and the result is written in-place.
+ *               'shift' is a vector.
+ */
+#define SRAR_D2(RTYPE, in0, in1, shift) do {            \
+  in0 = (RTYPE)__msa_srar_d((v2i64)in0, (v2i64)shift);  \
+  in1 = (RTYPE)__msa_srar_d((v2i64)in1, (v2i64)shift);  \
+} while (0)
+#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
+#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
+#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)
+
+#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do {  \
+  SRAR_D2(RTYPE, in0, in1, shift);                      \
+  SRAR_D2(RTYPE, in2, in3, shift);                      \
+} while (0)
+#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
+#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
+
 /* Description : Addition of 2 pairs of half-word vectors
  * Arguments   : Inputs  - in0, in1, in2, in3
  *               Outputs - out0, out1
  * Details     : Each element in 'in0' is added to 'in1' and result is written
  *               to 'out0'.
  */
-#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)ADDVI_H(in0, in1);                         \
-  out1 = (RTYPE)ADDVI_H(in2, in3);                         \
-}
+#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)ADDVI_H(in0, in1);                            \
+  out1 = (RTYPE)ADDVI_H(in2, in3);                            \
+} while (0)
 #define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
 #define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
 
+/* Description : Addition of 2 pairs of word vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ * Details     : Each element in 'in0' is added to 'in1' and result is written
+ *               to 'out0'.
+ */
+#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)ADDVI_W(in0, in1);                            \
+  out1 = (RTYPE)ADDVI_W(in2, in3);                            \
+} while (0)
+#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
+
+/* Description : Fill a pair of word vectors with GP registers
+ * Arguments   : Inputs  - in0, in1
+ *               Outputs - out0, out1
+ * Details     : GP register in0 is replicated in each word element of out0
+ *               GP register in1 is replicated in each word element of out1
+ */
+#define FILL_W2(RTYPE, in0, in1, out0, out1) do {  \
+  out0 = (RTYPE)__msa_fill_w(in0);                 \
+  out1 = (RTYPE)__msa_fill_w(in1);                 \
+} while (0)
+#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
+
 /* Description : Addition of 2 pairs of vectors
  * Arguments   : Inputs  - in0, in1, in2, in3
  *               Outputs - out0, out1
  * Details     : Each element in 'in0' is added to 'in1' and result is written
  *               to 'out0'.
  */
-#define ADD2(in0, in1, in2, in3, out0, out1) {  \
-  out0 = in0 + in1;                             \
-  out1 = in2 + in3;                             \
-}
+#define ADD2(in0, in1, in2, in3, out0, out1) do {  \
+  out0 = in0 + in1;                                \
+  out1 = in2 + in3;                                \
+} while (0)
+
 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
-             out0, out1, out2, out3) {                \
+             out0, out1, out2, out3) do {             \
   ADD2(in0, in1, in2, in3, out0, out1);               \
   ADD2(in4, in5, in6, in7, out2, out3);               \
-}
+} while (0)
+
+/* Description : Subtraction of 2 pairs of vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ * Details     : Each element in 'in1' is subtracted from 'in0' and result is
+ *               written to 'out0'.
+ */
+#define SUB2(in0, in1, in2, in3, out0, out1) do {  \
+  out0 = in0 - in1;                                \
+  out1 = in2 - in3;                                \
+} while (0)
+
+#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do {  \
+  out0 = in0 - in1;                                                \
+  out1 = in2 - in3;                                                \
+  out2 = in4 - in5;                                                \
+} while (0)
+
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) do {             \
+  out0 = in0 - in1;                                   \
+  out1 = in2 - in3;                                   \
+  out2 = in4 - in5;                                   \
+  out3 = in6 - in7;                                   \
+} while (0)
+
+/* Description : Addition - Subtraction of input vectors
+ * Arguments   : Inputs  - in0, in1
+ *               Outputs - out0, out1
+ * Details     : Each element in 'in1' is added to 'in0' and result is
+ *               written to 'out0'.
+ *               Each element in 'in1' is subtracted from 'in0' and result is
+ *               written to 'out1'.
+ */
+#define ADDSUB2(in0, in1, out0, out1) do {  \
+  out0 = in0 + in1;                         \
+  out1 = in0 - in1;                         \
+} while (0)
+
+/* Description : Multiplication of pairs of vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1
+ * Details     : Each element from 'in0' is multiplied with elements from 'in1'
+ *               and the result is written to 'out0'
+ */
+#define MUL2(in0, in1, in2, in3, out0, out1) do {  \
+  out0 = in0 * in1;                                \
+  out1 = in2 * in3;                                \
+} while (0)
+
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) do {             \
+  MUL2(in0, in1, in2, in3, out0, out1);               \
+  MUL2(in4, in5, in6, in7, out2, out3);               \
+} while (0)
+
+/* Description : Sign extend halfword elements from right half of the vector
+ * Arguments   : Input  - in    (halfword vector)
+ *               Output - out   (sign extended word vector)
+ *               Return Type - signed word
+ * Details     : Sign bit of halfword elements from input vector 'in' is
+ *               extracted and interleaved with the same vector 'in' to
+ *               generate 4 word elements keeping sign intact
+ */
+#define UNPCK_R_SH_SW(in, out) do {                   \
+  const v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0);  \
+  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);       \
+} while (0)
 
 /* Description : Sign extend halfword elements from input vector and return
  *               the result in pair of vectors
@@ -497,29 +1254,82 @@
  *               Then interleaved left with same vector 'in0' to
  *               generate 4 signed word elements in 'out1'
  */
-#define UNPCK_SH_SW(in, out0, out1) {                 \
+#define UNPCK_SH_SW(in, out0, out1) do {              \
   const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0);   \
   ILVRL_H2_SW(tmp_m, in, out0, out1);                 \
-}
+} while (0)
 
 /* Description : Butterfly of 4 input vectors
  * Arguments   : Inputs  - in0, in1, in2, in3
  *               Outputs - out0, out1, out2, out3
  * Details     : Butterfly operation
  */
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
-  out0 = in0 + in3;                                                \
-  out1 = in1 + in2;                                                \
-  out2 = in1 - in2;                                                \
-  out3 = in0 - in3;                                                \
-}
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  out0 = in0 + in3;                                                   \
+  out1 = in1 + in2;                                                   \
+  out2 = in1 - in2;                                                   \
+  out3 = in0 - in3;                                                   \
+} while (0)
+
+/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+ *                         in8, in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3
+ *               Return Type - unsigned byte
+ */
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3) do {                   \
+  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m;                    \
+  ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m);                        \
+  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                        \
+  ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3);                 \
+  ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m);                       \
+  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                       \
+  ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);             \
+  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                 \
+  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2);               \
+  ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                 \
+  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3);               \
+} while (0)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+ *                         in8, in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ *               Return Type - unsigned byte
+ */
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3, out4, out5,            \
+                            out6, out7) do {                               \
+  v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m;                    \
+  v4i32 tmp2_m, tmp3_m;                                                    \
+  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
+  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
+  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
+  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
+  ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m);                     \
+  ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m);                     \
+  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7);                         \
+  ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m);                     \
+  ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
+  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4);               \
+  ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
+  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6);               \
+  ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
+  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5);               \
+  ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
+  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7);               \
+} while (0)
 
 /* Description : Transpose 4x4 block with word elements in vectors
  * Arguments   : Inputs  - in0, in1, in2, in3
  *                Outputs - out0, out1, out2, out3
  *                Return Type - as per RTYPE
  */
-#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
+#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,                            \
+                       out0, out1, out2, out3) do {                          \
   v4i32 s0_m, s1_m, s2_m, s3_m;                                              \
   ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                         \
   ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                         \
@@ -527,7 +1337,7 @@
   out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                      \
   out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                      \
   out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                      \
-}
+} while (0)
 #define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
 
 /* Description : Add block 4x4
@@ -535,7 +1345,7 @@
  * Details     : Least significant 4 bytes from each input vector are added to
  *               the destination bytes, clipped between 0-255 and stored.
  */
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
   uint32_t src0_m, src1_m, src2_m, src3_m;                      \
   v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
   v16i8 dst0_m = { 0 };                                         \
@@ -550,6 +1360,31 @@
   CLIP_SH2_0_255(res0_m, res1_m);                               \
   PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
   ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
-}
+} while (0)
+
+/* Description : Pack even byte elements, extract 0 & 2 index words from pair
+ *               of results and store 4 words in destination memory as per
+ *               stride
+ * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+ */
+#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
+  v16i8 tmp0_m, tmp1_m;                                        \
+  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);             \
+  ST4x4_UB(tmp0_m, tmp1_m, 0, 2, 0, 2, pdst, stride);          \
+} while (0)
+
+/* Description : average with rounding (in0 + in1 + 1) / 2.
+ * Arguments   : Inputs  - in0, in1, in2, in3,
+ *               Outputs - out0, out1
+ *               Return Type - as per RTYPE
+ * Details     : Each unsigned byte element from 'in0' vector is added with
+ *               each unsigned byte element from 'in1' vector. Then the average
+ *               with rounding is calculated and written to 'out0'
+ */
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
+  out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);       \
+  out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3);       \
+} while (0)
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
 #endif  /* WEBP_DSP_MSA_MACRO_H_ */
diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h
index 0a06266..3b548a6 100644
--- a/src/3rdparty/libwebp/src/dsp/neon.h
+++ b/src/3rdparty/libwebp/src/dsp/neon.h
@@ -79,4 +79,22 @@ static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
   }
 }
 
+#if 0     // Useful debug macro.
+#include <stdio.h>
+#define PRINT_REG(REG, SIZE) do {                       \
+  int i;                                                \
+  printf("%s \t[%d]: 0x", #REG, SIZE);                  \
+  if (SIZE == 8) {                                      \
+    uint8_t _tmp[8];                                    \
+    vst1_u8(_tmp, (REG));                               \
+    for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]);   \
+  } else if (SIZE == 16) {                              \
+    uint16_t _tmp[4];                                   \
+    vst1_u16(_tmp, (REG));                              \
+    for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]);   \
+  }                                                     \
+  printf("\n");                                         \
+} while (0)
+#endif
+
 #endif  // WEBP_DSP_NEON_H_
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
index bc743d5..0f54502 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -14,7 +14,7 @@
 #include <assert.h>
 
 #include "./dsp.h"
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
@@ -173,10 +173,10 @@ void WebPRescalerExportRow(WebPRescaler* const wrk) {
       WebPRescalerExportRowExpand(wrk);
     } else if (wrk->fxy_scale) {
       WebPRescalerExportRowShrink(wrk);
-    } else {  // very special case for src = dst = 1x1
+    } else {  // special case
       int i;
+      assert(wrk->src_height == wrk->dst_height && wrk->x_add == 1);
       assert(wrk->src_width == 1 && wrk->dst_width <= 2);
-      assert(wrk->src_height == 1 && wrk->dst_height == 1);
       for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
         wrk->dst[i] = wrk->irow[i];
         wrk->irow[i] = 0;
@@ -199,6 +199,7 @@ WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 extern void WebPRescalerDspInitSSE2(void);
 extern void WebPRescalerDspInitMIPS32(void);
 extern void WebPRescalerDspInitMIPSdspR2(void);
+extern void WebPRescalerDspInitMSA(void);
 extern void WebPRescalerDspInitNEON(void);
 
 static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
@@ -233,6 +234,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
       WebPRescalerDspInitMIPSdspR2();
     }
 #endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      WebPRescalerDspInitMSA();
+    }
+#endif
   }
   rescaler_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
index ddaa391..e09ad5d 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
@@ -16,7 +16,7 @@
 #if defined(WEBP_USE_MIPS32)
 
 #include <assert.h>
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 // Row import
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
index b457d0a..2308d64 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
@@ -16,7 +16,7 @@
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
 #include <assert.h>
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
new file mode 100644
index 0000000..2c10e55
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
@@ -0,0 +1,444 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of rescaling functions
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <assert.h>
+
+#include "../utils/rescaler_utils.h"
+#include "./msa_macro.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)  // bias added before >> RFIX
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
+  v4u32 tmp0, tmp1, tmp2, tmp3;  /* vector MULT_FIX of 16 words */    \
+  v16u8 t0, t1, t2, t3, t4, t5;  /* needs 'zero' in caller scope */   \
+  v2u64 out0, out1, out2, out3;                                       \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
+  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
+  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);  /* rounding shift */    \
+  PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
+  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
+  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
+  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);  /* rounding shift */    \
+  PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
+  PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
+  dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);  /* 16 bytes */   \
+} while (0)
+
+#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
+  v4u32 tmp0, tmp1;  /* MULT_FIX of 4 words */        \
+  v16i8 t0, t1;  /* needs 'zero' in caller scope */   \
+  v2u64 out0, out1;                                   \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                 \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);  \
+  SRAR_D2_UD(out0, out1, shift);  /* rounding */      \
+  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);       \
+  t1 = __msa_pckev_b(t0, t0);                         \
+  t0 = __msa_pckev_b(t1, t1);                         \
+  dst = __msa_copy_s_w((v4i32)t0, 0);  /* 4 bytes */  \
+} while (0)
+
+#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
+                          dst0, dst1, dst2, dst3) do {         \
+  v4u32 tmp0, tmp1, tmp2, tmp3;  /* MULT_FIX, word results */  \
+  v2u64 out0, out1, out2, out3;  /* needs caller's 'zero' */   \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
+  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
+  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
+  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
+  PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
+  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
+  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
+  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
+  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
+  PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
+} while (0)
+
+#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
+  v4u32 tmp0, tmp1;  /* MULT_FIX, word results */        \
+  v2u64 out0, out1;  /* needs caller's 'zero' */         \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                    \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);     \
+  SRAR_D2_UD(out0, out1, shift);                         \
+  dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);  \
+} while (0)
+
+#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
+                          dst0, dst1) do {                         \
+  v4u32 tmp0, tmp1, tmp2, tmp3;  /* weight by mult, then scale */  \
+  v2u64 out0, out1, out2, out3;                                    \
+  ILVRL_W2_UW(in0, in2, tmp0, tmp1);  /* pairs in0 with in2 */     \
+  ILVRL_W2_UW(in1, in3, tmp2, tmp3);  /* pairs in1 with in3 */     \
+  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
+  DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);  /* after x mult */   \
+  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
+  DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);  /* after x scale */  \
+  PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
+} while (0)
+
+#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
+  v4u32 tmp0, tmp1;  /* weight by mult, then scale */             \
+  v2u64 out0, out1;                                               \
+  v16i8 t0, t1;                                                   \
+  ILVRL_W2_UW(in0, in1, tmp0, tmp1);  /* pairs in0 with in1 */    \
+  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
+  SRAR_D2_UD(out0, out1, shift);  /* after x mult */              \
+  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
+  SRAR_D2_UD(out0, out1, shift);  /* after x scale */             \
+  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
+  t1 = __msa_pckev_b(t0, t0);                                     \
+  t0 = __msa_pckev_b(t1, t1);                                     \
+  dst = __msa_copy_s_w((v4i32)t0, 0);  /* 4 bytes */              \
+} while (0)
+
+static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+                                          int length,  // # of frow/dst items
+                                          WebPRescaler* const wrk) {
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4i32 zero = { 0 };  // used by name inside the CALC_MULT_FIX_* macros
+
+  while (length >= 16) {  // four 4-word vectors -> 16 output bytes per pass
+    v4u32 src0, src1, src2, src3;
+    v16u8 out;
+    LD_UW4(frow, 4, src0, src1, src2, src3);
+    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
+    ST_UB(out, dst);
+    length -= 16;
+    frow   += 16;
+    dst    += 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {  // 12..15 left: three vectors
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 src0, src1, src2;
+      LD_UW3(frow, 4, src0, src1, src2);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      length -= 12;
+      frow   += 12;
+      dst    += 12;
+    } else if (length >= 8) {  // 8..11 left: two vectors
+      uint32_t val0_m, val1_m;
+      v4u32 src0, src1;
+      LD_UW2(frow, 4, src0, src1);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      length -= 8;
+      frow   += 8;
+      dst    += 8;
+    } else if (length >= 4) {  // 4..7 left: one vector
+      uint32_t val0_m;
+      const v4u32 src0 = LD_UW(frow);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      SW(val0_m, dst);
+      length -= 4;
+      frow   += 4;
+      dst    += 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {  // scalar tail (< 4 values left)
+      const uint32_t J = frow[x_out];
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);  // must fit the uint8_t it is stored to
+      dst[x_out] = v;
+    }
+  }
+}
+
+// dst[i] = MULT_FIX(J, fy_scale), J = (A*frow[i] + B*irow[i] + ROUNDER) >> RFIX
+static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+                                          uint8_t* dst, int length,
+                                          WebPRescaler* const wrk) {
+  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);  // A: frow, B: irow
+  const v4i32 B1 = __msa_fill_w(B);
+  const v4i32 A1 = __msa_fill_w(A);
+  const v4i32 AB = __msa_ilvr_w(A1, B1);  // A and B in alternating lanes
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+
+  while (length >= 16) {  // 16 outputs per pass
+    v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
+    v16u8 t0, t1, t2, t3, t4, t5;
+    LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
+    LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
+    CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
+    CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
+    PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
+    t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
+    ST_UB(t0, dst);
+    frow   += 16;
+    irow   += 16;
+    dst    += 16;
+    length -= 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {  // 12..15 left: three vectors
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
+      LD_UW3(frow, 4, frow0, frow1, frow2);
+      LD_UW3(irow, 4, irow0, irow1, irow2);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+      CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      frow   += 12;
+      irow   += 12;
+      dst    += 12;
+      length -= 12;
+    } else if (length >= 8) {  // 8..11 left: two vectors
+      uint32_t val0_m, val1_m;
+      v4u32 frow0, frow1, irow0, irow1;
+      LD_UW2(frow, 4, frow0, frow1);
+      LD_UW2(irow, 4, irow0, irow1);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      frow   += 8;  // both 4-word vectors were consumed, so step by 8,
+      irow   += 8;  // matching ExportRowExpand_0 and leaving < 4 values
+      dst    += 8;  // for the scalar tail below.
+      length -= 8;
+    } else if (length >= 4) {  // 4..7 left: one vector
+      uint32_t val0_m;
+      const v4u32 frow0 = LD_UW(frow + 0);
+      const v4u32 irow0 = LD_UW(irow + 0);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      SW(val0_m, dst);
+      frow   += 4;
+      irow   += 4;
+      dst    += 4;
+      length -= 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {  // scalar tail (< 4 values left)
+      const uint64_t I = (uint64_t)A * frow[x_out] + (uint64_t)B * irow[x_out];
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);  // must fit the uint8_t it is stored to
+      dst[x_out] = v;
+    }
+  }
+}
+
+static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;  // values per row
+  const rescaler_t* frow = wrk->frow;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);  // used as 2nd arg of WEBP_RESCALER_FRAC
+  if (wrk->y_accum == 0) {  // exports from frow only
+    ExportRowExpand_0(frow, dst, x_out_max, wrk);
+  } else {  // y_accum < 0: exports a weighted mix of frow and irow
+    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
+  }
+}
+
+static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+                                          uint8_t* dst, int length,
+                                          const uint32_t yscale,
+                                          WebPRescaler* const wrk) {
+  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);  // yscale, all lanes
+  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);  // all lanes
+  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);  // all lanes
+  const v4i32 zero = { 0 };  // presumably used by the CALC_* macros - verify
+
+  while (length >= 16) {  // 16 samples (four vectors) per iteration
+    v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
+    v16u8 out;
+    LD_UW4(frow, 4, src0, src1, src2, src3);
+    CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
+                      frac0, frac1, frac2, frac3);
+    LD_UW4(irow, 4, src0, src1, src2, src3);
+    SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
+         src0, src1, src2, src3);
+    CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
+    ST_UB(out, dst);
+    ST_UW4(frac0, frac1, frac2, frac3, irow, 4);  // presumably irow[] = frac
+    frow   += 16;
+    irow   += 16;
+    dst    += 16;
+    length -= 16;
+  }
+  if (length > 0) {  // fewer than 16 samples remain
+    int x_out;
+    if (length >= 12) {  // 12..15 left: three vectors
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 src0, src1, src2, frac0, frac1, frac2;
+      LD_UW3(frow, 4, src0, src1, src2);
+      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+      CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
+      LD_UW3(irow, 4, src0, src1, src2);
+      SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
+      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+      CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      ST_UW3(frac0, frac1, frac2, irow, 4);
+      frow   += 12;
+      irow   += 12;
+      dst    += 12;
+      length -= 12;
+    } else if (length >= 8) {  // 8..11 left: two vectors
+      uint32_t val0_m, val1_m;
+      v4u32 src0, src1, frac0, frac1;
+      LD_UW2(frow, 4, src0, src1);
+      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+      LD_UW2(irow, 4, src0, src1);
+      SUB2(src0, frac0, src1, frac1, src0, src1);
+      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      ST_UW2(frac0, frac1, irow, 4);
+      frow   += 8;
+      irow   += 8;
+      dst    += 8;
+      length -= 8;
+    } else if (length >= 4) {  // 4..7 left: one vector
+      uint32_t val0_m;
+      v4u32 frac0;
+      v4u32 src0 = LD_UW(frow);
+      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+      src0 = LD_UW(irow);
+      src0 = src0 - frac0;
+      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+      SW(val0_m, dst);
+      ST_UW(frac0, irow);
+      frow   += 4;
+      irow   += 4;
+      dst    += 4;
+      length -= 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {  // scalar tail, < 4 samples
+      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
+      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;      // exported 8-bit sample
+      irow[x_out] = frac;  // irow keeps 'frac' after the export
+    }
+  }
+}  // tail: dst[] = MULT_FIX(irow[] - frac, fxy_scale), then irow[] = frac
+
+static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+                                          int length,
+                                          WebPRescaler* const wrk) {
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);  // all lanes
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);  // all lanes
+  const v4i32 zero = { 0 };  // passed to ST_SW*() to overwrite irow[]
+
+  while (length >= 16) {  // 16 samples (four vectors) per iteration
+    v4u32 src0, src1, src2, src3;
+    v16u8 dst0;
+    LD_UW4(irow, 4, src0, src1, src2, src3);
+    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
+    ST_UB(dst0, dst);
+    ST_SW4(zero, zero, zero, zero, irow, 4);  // presumably irow[] = 0
+    length -= 16;
+    irow   += 16;
+    dst    += 16;
+  }
+  if (length > 0) {  // fewer than 16 samples remain
+    int x_out;
+    if (length >= 12) {  // 12..15 left: three vectors
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 src0, src1, src2;
+      LD_UW3(irow, 4, src0, src1, src2);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      ST_SW3(zero, zero, zero, irow, 4);
+      length -= 12;
+      irow   += 12;
+      dst    += 12;
+    } else if (length >= 8) {  // 8..11 left: two vectors
+      uint32_t val0_m, val1_m;
+      v4u32 src0, src1;
+      LD_UW2(irow, 4, src0, src1);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      ST_SW2(zero, zero, irow, 4);
+      length -= 8;
+      irow   += 8;
+      dst    += 8;
+    } else if (length >= 4) {  // 4..7 left: one vector
+      uint32_t val0_m;
+      const v4u32 src0 = LD_UW(irow + 0);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      SW(val0_m, dst);
+      ST_SW(zero, irow);
+      length -= 4;
+      irow   += 4;
+      dst    += 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {  // scalar tail, < 4 samples
+      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;   // exported 8-bit sample
+      irow[x_out] = 0;  // irow entry is cleared once exported
+    }
+  }
+}  // tail: dst[] = MULT_FIX(irow[], fxy_scale), then irow[] = 0
+
+static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+  // Number of output samples in one row, all channels counted.
+  const int length = wrk->dst_width * wrk->num_channels;
+  // A zero product selects the variant that takes no 'frow' argument.
+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  if (yscale == 0) {
+    // Export 'irow' alone.
+    ExportRowShrink_1(wrk->irow, wrk->dst, length, wrk);
+  } else {
+    ExportRowShrink_0(wrk->frow, wrk->irow, wrk->dst, length, yscale, wrk);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
+  WebPRescalerExportRowShrink = RescalerExportRowShrink;  // MSA shrink export
+  WebPRescalerExportRowExpand = RescalerExportRowExpand;  // MSA expand export
+}
+
+#else     // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
+
+#endif    // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
index 16fd450..b2dd8f3 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
@@ -18,7 +18,7 @@
 #include <arm_neon.h>
 #include <assert.h>
 #include "./neon.h"
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
index 5b97028..8271c22 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -17,7 +17,7 @@
 #include <emmintrin.h>
 
 #include <assert.h>
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 651274f..265e722 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -215,6 +215,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
 extern void WebPInitUpsamplersSSE2(void);
 extern void WebPInitUpsamplersNEON(void);
 extern void WebPInitUpsamplersMIPSdspR2(void);
+extern void WebPInitUpsamplersMSA(void);
 
 static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
     (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
@@ -252,6 +253,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
       WebPInitUpsamplersMIPSdspR2();
     }
 #endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      WebPInitUpsamplersMSA();
+    }
+#endif
   }
 #endif  // FANCY_UPSAMPLING
   upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
new file mode 100644
index 0000000..f24926f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
@@ -0,0 +1,678 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of YUV to RGB upsampling functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include <string.h>
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./msa_macro.h"
+#include "./yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#define ILVR_UW2(in, out0, out1) do {                            \
+  const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in);  \
+  out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0);                   \
+  out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0);                   \
+} while (0)  // right half of 'in' interleaved with caller's 'zero' -> 2 v4u32
+
+#define ILVRL_UW4(in, out0, out1, out2, out3) do {  \
+  v16u8 t0, t1;                                     \
+  ILVRL_B2_UB(zero, in, t0, t1);                    \
+  ILVRL_H2_UW(zero, t0, out0, out1);                \
+  ILVRL_H2_UW(zero, t1, out2, out3);                \
+} while (0)  // presumably all of 'in' -> 4 v4u32; uses caller's 'zero'
+
+#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do {   \
+  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256);        \
+  v4u32 temp0, temp1, temp2, temp3;                            \
+  MUL4(in0, const0, in1, const0, in2, const0, in3, const0,     \
+       temp0, temp1, temp2, temp3);                            \
+  PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1);         \
+} while (0)  // in * (cnst * 256) per lane; presumably odd halfwords are kept
+
+#define MULTHI_8(in0, in1, cnst, out0) do {                 \
+  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256);     \
+  v4u32 temp0, temp1;                                       \
+  MUL2(in0, const0, in1, const0, temp0, temp1);             \
+  out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0);  \
+} while (0)  // in * (cnst * 256) per lane, then the odd halfwords are packed
+
+#define CALC_R16(y0, y1, v0, v1, dst) do {                \
+  const v8i16 const_a = (v8i16)__msa_fill_h(14234);       \
+  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0);  \
+  const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1);  \
+  v8i16 b0 = __msa_subs_s_h(a0, const_a);                 \
+  v8i16 b1 = __msa_subs_s_h(a1, const_a);                 \
+  SRAI_H2_SH(b0, b1, 6);                                  \
+  CLIP_SH2_0_255(b0, b1);                                 \
+  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0);       \
+} while (0)  // dst = clip((y + v - 14234) >> 6), signed saturating add/sub
+
+#define CALC_R8(y0, v0, dst) do {                         \
+  const v8i16 const_a = (v8i16)__msa_fill_h(14234);       \
+  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0);  \
+  v8i16 b0 = __msa_subs_s_h(a0, const_a);                 \
+  b0 = SRAI_H(b0, 6);                                     \
+  CLIP_SH_0_255(b0);                                      \
+  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0);       \
+} while (0)  // one-vector CALC_R16; b0 is packed together with itself
+
+#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do {   \
+  const v8i16 const_a = (v8i16)__msa_fill_h(8708);   \
+  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0);   \
+  v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1);   \
+  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0);    \
+  const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1);    \
+  a0 = __msa_adds_s_h(b0, const_a);                  \
+  a1 = __msa_adds_s_h(b1, const_a);                  \
+  SRAI_H2_SH(a0, a1, 6);                             \
+  CLIP_SH2_0_255(a0, a1);                            \
+  dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);  \
+} while (0)  // dst = clip((y - u - v + 8708) >> 6), signed saturating math
+
+#define CALC_G8(y0, u0, v0, dst) do {                \
+  const v8i16 const_a = (v8i16)__msa_fill_h(8708);   \
+  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0);   \
+  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0);    \
+  a0 = __msa_adds_s_h(b0, const_a);                  \
+  a0 = SRAI_H(a0, 6);                                \
+  CLIP_SH_0_255(a0);                                 \
+  dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0);  \
+} while (0)  // one-vector CALC_G16; a0 is packed together with itself
+
+#define CALC_B16(y0, y1, u0, u1, dst) do {           \
+  const v8u16 const_a = (v8u16)__msa_fill_h(17685);  \
+  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0);    \
+  const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1);    \
+  v8u16 b0 = __msa_subs_u_h(a0, const_a);            \
+  v8u16 b1 = __msa_subs_u_h(a1, const_a);            \
+  SRAI_H2_UH(b0, b1, 6);                             \
+  CLIP_UH2_0_255(b0, b1);                            \
+  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0);  \
+} while (0)  // dst = clip((y + u - 17685) >> 6), unsigned saturating add/sub
+
+#define CALC_B8(y0, u0, dst) do {                    \
+  const v8u16 const_a = (v8u16)__msa_fill_h(17685);  \
+  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0);    \
+  v8u16 b0 = __msa_subs_u_h(a0, const_a);            \
+  b0 = SRAI_H(b0, 6);                                \
+  CLIP_UH_0_255(b0);                                 \
+  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0);  \
+} while (0)  // one-vector CALC_B16; b0 is packed together with itself
+
+#define CALC_RGB16(y, u, v, R, G, B) do {    \
+  const v16u8 zero = { 0 };                  \
+  v8u16 y0, y1, u0, u1, v0, v1;              \
+  v4u32 p0, p1, p2, p3;                      \
+  const v16u8 in_y = LD_UB(y);               \
+  const v16u8 in_u = LD_UB(u);               \
+  const v16u8 in_v = LD_UB(v);               \
+  ILVRL_UW4(in_y, p0, p1, p2, p3);           \
+  MULTHI_16(p0, p1, p2, p3, 19077, y0, y1);  \
+  ILVRL_UW4(in_v, p0, p1, p2, p3);           \
+  MULTHI_16(p0, p1, p2, p3, 26149, v0, v1);  \
+  CALC_R16(y0, y1, v0, v1, R);               \
+  MULTHI_16(p0, p1, p2, p3, 13320, v0, v1);  \
+  ILVRL_UW4(in_u, p0, p1, p2, p3);           \
+  MULTHI_16(p0, p1, p2, p3, 6419, u0, u1);   \
+  CALC_G16(y0, y1, u0, u1, v0, v1, G);       \
+  MULTHI_16(p0, p1, p2, p3, 33050, u0, u1);  \
+  CALC_B16(y0, y1, u0, u1, B);               \
+} while (0)  // loads one vector each from y, u, v; writes vectors R, G, B
+
+#define CALC_RGB8(y, u, v, R, G, B) do {  \
+  const v16u8 zero = { 0 };               \
+  v8u16 y0, u0, v0;                       \
+  v4u32 p0, p1;                           \
+  const v16u8 in_y = LD_UB(y);            \
+  const v16u8 in_u = LD_UB(u);            \
+  const v16u8 in_v = LD_UB(v);            \
+  ILVR_UW2(in_y, p0, p1);                 \
+  MULTHI_8(p0, p1, 19077, y0);            \
+  ILVR_UW2(in_v, p0, p1);                 \
+  MULTHI_8(p0, p1, 26149, v0);            \
+  CALC_R8(y0, v0, R);                     \
+  MULTHI_8(p0, p1, 13320, v0);            \
+  ILVR_UW2(in_u, p0, p1);                 \
+  MULTHI_8(p0, p1, 6419, u0);             \
+  CALC_G8(y0, u0, v0, G);                 \
+  MULTHI_8(p0, p1, 33050, u0);            \
+  CALC_B8(y0, u0, B);                     \
+} while (0)  // half-width CALC_RGB16; still loads a whole v16u8 from y, u, v
+
+#define STORE16_3(a0, a1, a2, dst) do {                          \
+  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,  \
+                        8, 9, 20, 10 };                          \
+  const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7,  \
+                        8, 25, 9, 10 };                          \
+  const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7,  \
+                        30, 8, 9, 31 };                          \
+  v16u8 out0, out1, out2, tmp0, tmp1, tmp2;                      \
+  ILVRL_B2_UB(a1, a0, tmp0, tmp1);                               \
+  out0 = VSHF_UB(tmp0, a2, mask0);                               \
+  tmp2 = SLDI_UB(tmp1, tmp0, 11);                                \
+  out1 = VSHF_UB(tmp2, a2, mask1);                               \
+  tmp2 = SLDI_UB(tmp1, tmp1, 6);                                 \
+  out2 = VSHF_UB(tmp2, a2, mask2);                               \
+  ST_UB(out0, dst +  0);                                         \
+  ST_UB(out1, dst + 16);                                         \
+  ST_UB(out2, dst + 32);                                         \
+} while (0)  // shuffles a0, a1, a2 together; three vectors at dst + 0/16/32
+
+#define STORE8_3(a0, a1, a2, dst) do {                             \
+  int64_t out_m;                                                   \
+  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,    \
+                        8, 9, 20, 10 };                            \
+  const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23,            \
+                        255, 255, 255, 255, 255, 255, 255, 255 };  \
+  const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);    \
+  v16u8 out0, out1;                                                \
+  VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1);        \
+  ST_UB(out0, dst);                                                \
+  out_m = __msa_copy_s_d((v2i64)out1, 0);                          \
+  SD(out_m, dst + 16);                                             \
+} while (0)  // one vector at dst, then the 64-bit 'out_m' at dst + 16
+
+#define STORE16_4(a0, a1, a2, a3, dst) do {  \
+  v16u8 tmp0, tmp1, tmp2, tmp3;              \
+  v16u8 out0, out1, out2, out3;              \
+  ILVRL_B2_UB(a1, a0, tmp0, tmp1);           \
+  ILVRL_B2_UB(a3, a2, tmp2, tmp3);           \
+  ILVRL_H2_UB(tmp2, tmp0, out0, out1);       \
+  ILVRL_H2_UB(tmp3, tmp1, out2, out3);       \
+  ST_UB(out0, dst +  0);                     \
+  ST_UB(out1, dst + 16);                     \
+  ST_UB(out2, dst + 32);                     \
+  ST_UB(out3, dst + 48);                     \
+} while (0)  // combines a0..a3; four vectors stored at dst + 0/16/32/48
+
+#define STORE8_4(a0, a1, a2, a3, dst) do {  \
+  v16u8 tmp0, tmp1, tmp2, tmp3;             \
+  ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1);   \
+  ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3);      \
+  ST_UB(tmp2, dst +  0);                    \
+  ST_UB(tmp3, dst + 16);                    \
+} while (0)  // combines a0..a3; two vectors stored at dst + 0/16
+
+#define STORE2_16(a0, a1, dst) do {  \
+  v16u8 out0, out1;                  \
+  ILVRL_B2_UB(a1, a0, out0, out1);   \
+  ST_UB(out0, dst +  0);             \
+  ST_UB(out1, dst + 16);             \
+} while (0)  // combines a0 and a1; two vectors stored at dst + 0/16
+
+#define STORE2_8(a0, a1, dst) do {                               \
+  const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);  \
+  ST_UB(out0, dst);                                              \
+} while (0)  // right halves of a0 and a1 interleaved; one vector at dst
+
+#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do {  \
+  CALC_RGB##N(y, u, v, R, G, B);  /* caller's R, G, B */ \
+  tmp0 = ANDI_B(R, 0xf0);                                \
+  tmp1 = ANDI_B(SRAI_B(G, 4), 0x0f);                     \
+  RG = tmp0 | tmp1;  /* (R & 0xf0) | (G >> 4) */         \
+  tmp0 = ANDI_B(B, 0xf0);                                \
+  BA = ORI_B(tmp0, 0x0f);  /* (B & 0xf0) | 0x0f */       \
+  STORE2_##N(out0, out1, dst);                           \
+} while (0)  // G >> 4 is masked: SRAI_B may be arithmetic and fill with 1s
+
+#define CALC_RGB565(y, u, v, out0, out1, N, dst) do {  \
+  CALC_RGB##N(y, u, v, R, G, B);                       \
+  tmp0 = ANDI_B(R, 0xf8);                              \
+  tmp1 = ANDI_B(SRAI_B(G, 5), 0x07);                   \
+  RG = tmp0 | tmp1;  /* (R & 0xf8) | (G >> 5) */       \
+  tmp0 = SLLI_B(G, 3);                                 \
+  tmp1 = ANDI_B(tmp0, 0xe0);                           \
+  tmp0 = ANDI_B(SRAI_B(B, 3), 0x1f);                   \
+  GB = tmp0 | tmp1; /* ((G << 3) & 0xe0) | (B >> 3) */ \
+  STORE2_##N(out0, out1, dst);                         \
+} while (0)  // shifts are masked: SRAI_B may be arithmetic and fill with 1s
+
+static WEBP_INLINE int Clip8(int v) {
+  return (v > 255) ? 255 : (v < 0) ? 0 : v;  // saturate to [0, 255]
+}
+
+static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+  const int luma = MultHi(y, 19077);  // term shared by the three channels
+  const int blue = luma + MultHi(u, 33050) - 17685;
+  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+  const int red = luma + MultHi(v, 26149) - 14234;
+  rgb[2] = Clip8(blue >> 6);  // '>> 6', then clamp to [0, 255]
+  rgb[1] = Clip8(green >> 6);
+  rgb[0] = Clip8(red >> 6);
+}
+
+static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+  const int luma = MultHi(y, 19077);  // term shared by the three channels
+  const int blue = luma + MultHi(u, 33050) - 17685;
+  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+  const int red = luma + MultHi(v, 26149) - 14234;
+  bgr[2] = Clip8(red >> 6);  // '>> 6', then clamp to [0, 255]
+  bgr[1] = Clip8(green >> 6);
+  bgr[0] = Clip8(blue >> 6);
+}
+
+static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+  const int luma = MultHi(y, 19077);
+  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
+  const int green =
+      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
+  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
+  // Top 5 bits of red followed by the top 3 bits of green.
+  const int hi = (red & 0xf8) | (green >> 5);
+  // Next 3 bits of green followed by the top 5 bits of blue.
+  const int lo = ((green << 3) & 0xe0) | (blue >> 3);
+#ifdef WEBP_SWAP_16BIT_CSP
+  rgb[0] = lo;
+  rgb[1] = hi;
+#else
+  rgb[0] = hi;
+  rgb[1] = lo;
+#endif
+}
+
+static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
+  const int luma = MultHi(y, 19077);
+  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
+  const int green =
+      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
+  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
+  // Top nibble of red followed by the top nibble of green.
+  const int hi = (red & 0xf0) | (green >> 4);
+  // Top nibble of blue; the lower 4 bits are overwritten with 0xf.
+  const int lo = (blue & 0xf0) | 0x0f;
+#ifdef WEBP_SWAP_16BIT_CSP
+  argb[0] = lo;
+  argb[1] = hi;
+#else
+  argb[0] = hi;
+  argb[1] = lo;
+#endif
+}
+
+static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
+  YuvToRgb(y, u, v, argb + 1);  // bytes 1..3: R, G, B
+  argb[0] = 0xff;               // byte 0: constant 0xff
+}
+
+static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
+  bgra[3] = 0xff;           // byte 3: constant 0xff
+  YuvToBgr(y, u, v, bgra);  // bytes 0..2: B, G, R
+}
+
+static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
+  rgba[3] = 0xff;           // byte 3: constant 0xff
+  YuvToRgb(y, u, v, rgba);  // bytes 0..2: R, G, B
+}
+
+static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
+                         const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_3(R, G, B, dst);
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 3;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[3 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB16(tail, u, v, R, G, B);  // u and v are read in place
+    STORE16_3(R, G, B, tail);
+    memcpy(dst, tail, length * 3 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[3 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB8(tail, u, v, R, G, B);
+    STORE8_3(R, G, B, tail);
+    memcpy(dst, tail, length * 3 * sizeof(*dst));
+  }
+}
+
+static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
+                         const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_3(B, G, R, dst);
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 3;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[3 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB16(tail, u, v, R, G, B);  // u and v are read in place
+    STORE16_3(B, G, R, tail);
+    memcpy(dst, tail, length * 3 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[3 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB8(tail, u, v, R, G, B);
+    STORE8_3(B, G, R, tail);
+    memcpy(dst, tail, length * 3 * sizeof(*dst));
+  }
+}
+
+static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
+                          const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  const v16u8 A = (v16u8)__msa_ldi_b(0xff);  // fourth channel, all 0xff
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_4(R, G, B, A, dst);
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 4;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[4 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB16(tail, u, v, R, G, B);  // u and v are read in place
+    STORE16_4(R, G, B, A, tail);
+    memcpy(dst, tail, length * 4 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[4 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB8(tail, u, v, R, G, B);
+    STORE8_4(R, G, B, A, tail);
+    memcpy(dst, tail, length * 4 * sizeof(*dst));
+  }
+}
+
+static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
+                          const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  const v16u8 A = (v16u8)__msa_ldi_b(0xff);  // fourth channel, all 0xff
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_4(B, G, R, A, dst);
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 4;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[4 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB16(tail, u, v, R, G, B);  // u and v are read in place
+    STORE16_4(B, G, R, A, tail);
+    memcpy(dst, tail, length * 4 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[4 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB8(tail, u, v, R, G, B);
+    STORE8_4(B, G, R, A, tail);
+    memcpy(dst, tail, length * 4 * sizeof(*dst));
+  }
+}
+
+static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
+                          const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  const v16u8 A = (v16u8)__msa_ldi_b(0xff);  // first channel, all 0xff
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_4(A, R, G, B, dst);
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 4;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[4 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB16(tail, u, v, R, G, B);  // u and v are read in place
+    STORE16_4(A, R, G, B, tail);
+    memcpy(dst, tail, length * 4 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[4 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+    CALC_RGB8(tail, u, v, R, G, B);
+    STORE8_4(A, R, G, B, tail);
+    memcpy(dst, tail, length * 4 * sizeof(*dst));
+  }
+}
+
+static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
+                              const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B, RG, BA, tmp0, tmp1;  // all used by name in CALC_RGBA4444
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+#ifdef WEBP_SWAP_16BIT_CSP
+    CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
+#else
+    CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
+#endif
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 2;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[2 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+#ifdef WEBP_SWAP_16BIT_CSP
+    CALC_RGBA4444(tail, u, v, BA, RG, 16, tail);
+#else
+    CALC_RGBA4444(tail, u, v, RG, BA, 16, tail);
+#endif
+    memcpy(dst, tail, length * 2 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[2 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+#ifdef WEBP_SWAP_16BIT_CSP
+    CALC_RGBA4444(tail, u, v, BA, RG, 8, tail);
+#else
+    CALC_RGBA4444(tail, u, v, RG, BA, 8, tail);
+#endif
+    memcpy(dst, tail, length * 2 * sizeof(*dst));
+  }
+}
+
+static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
+                            const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B, RG, GB, tmp0, tmp1;  // all used by name in CALC_RGB565
+  for (; length >= 16; length -= 16) {  // whole groups of 16 pixels
+#ifdef WEBP_SWAP_16BIT_CSP
+    CALC_RGB565(y, u, v, GB, RG, 16, dst);
+#else
+    CALC_RGB565(y, u, v, RG, GB, 16, dst);
+#endif
+    y += 16;
+    u += 16;
+    v += 16;
+    dst += 16 * 2;
+  }
+  if (length <= 0) return;
+  if (length > 8) {  // 9..15 left: 'y' and the output go through 'tail'
+    uint8_t tail[2 * 16] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+#ifdef WEBP_SWAP_16BIT_CSP
+    CALC_RGB565(tail, u, v, GB, RG, 16, tail);
+#else
+    CALC_RGB565(tail, u, v, RG, GB, 16, tail);
+#endif
+    memcpy(dst, tail, length * 2 * sizeof(*dst));
+  } else {  // 1..8 left: same scheme with the 8-pixel macros
+    uint8_t tail[2 * 8] = { 0 };
+    memcpy(tail, y, length * sizeof(*tail));
+#ifdef WEBP_SWAP_16BIT_CSP
+    CALC_RGB565(tail, u, v, GB, RG, 8, tail);
+#else
+    CALC_RGB565(tail, u, v, RG, GB, 8, tail);
+#endif
+    memcpy(dst, tail, length * 2 * sizeof(*dst));
+  }
+}
+
+#define UPSAMPLE_32PIXELS(a, b, c, d) do {    \
+  v16u8 s = __msa_aver_u_b(a, d);             \
+  v16u8 t = __msa_aver_u_b(b, c);             \
+  const v16u8 st = s ^ t;                     \
+  v16u8 ad = a ^ d;                           \
+  v16u8 bc = b ^ c;                           \
+  v16u8 t0 = ad | bc;                         \
+  v16u8 t1 = t0 | st;                         \
+  v16u8 t2 = ANDI_B(t1, 1);                   \
+  v16u8 t3 = __msa_aver_u_b(s, t);            \
+  const v16u8 k = t3 - t2;                    \
+  v16u8 diag1, diag2;                         \
+  AVER_UB2_UB(t, k, s, k, t0, t1);            \
+  bc = bc & st;                               \
+  ad = ad & st;                               \
+  t = t ^ k;                                  \
+  s = s ^ k;                                  \
+  t2 = bc | t;                                \
+  t3 = ad | s;                                \
+  t2 = ANDI_B(t2, 1);                         \
+  t3 = ANDI_B(t3, 1);                         \
+  SUB2(t0, t2, t1, t3, diag1, diag2);         \
+  AVER_UB2_UB(a, diag1, b, diag2, t0, t1);    \
+  ILVRL_B2_UB(t1, t0, a, b);  /* top row */   \
+  if (bot_y != NULL) {  /* has bottom row */  \
+    AVER_UB2_UB(c, diag2, d, diag1, t0, t1);  \
+    ILVRL_B2_UB(t1, t0, c, d);  /* bottom */  \
+  }                                           \
+} while (0)  // tests caller's 'bot_y': 'pbot_y' is &bot_y[1], not a NULL flag
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
+                      const uint8_t* top_u, const uint8_t* top_v,        \
+                      const uint8_t* cur_u, const uint8_t* cur_v,        \
+                      uint8_t* top_dst, uint8_t* bot_dst, int len)       \
+{                                                                        \
+  int size = (len - 1) >> 1;                                             \
+  uint8_t temp_u[64];                                                    \
+  uint8_t temp_v[64];                                                    \
+  const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16));              \
+  const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16));               \
+  const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;            \
+  const uint8_t* ptop_y = &top_y[1];                                     \
+  uint8_t *ptop_dst = top_dst + XSTEP;                                   \
+  const uint8_t* pbot_y = &bot_y[1];                                     \
+  uint8_t *pbot_dst = bot_dst + XSTEP;                                   \
+                                                                         \
+  FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                      \
+  if (bot_y != NULL) {                                                   \
+    const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;          \
+    FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst);                    \
+  }                                                                      \
+  while (size >= 16) {                                                   \
+    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
+    LD_UB2(top_u, 1, tu0, tu1);                                          \
+    LD_UB2(cur_u, 1, cu0, cu1);                                          \
+    LD_UB2(top_v, 1, tv0, tv1);                                          \
+    LD_UB2(cur_v, 1, cv0, cv1);                                          \
+    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
+    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
+    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
+    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
+    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32);           \
+    if (bot_y != NULL) {                                                 \
+      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32);        \
+    }                                                                    \
+    ptop_y   += 32;                                                      \
+    pbot_y   += 32;                                                      \
+    ptop_dst += XSTEP * 32;                                              \
+    pbot_dst += XSTEP * 32;                                              \
+    top_u    += 16;                                                      \
+    top_v    += 16;                                                      \
+    cur_u    += 16;                                                      \
+    cur_v    += 16;                                                      \
+    size     -= 16;                                                      \
+  }                                                                      \
+  if (size > 0) {                                                        \
+    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
+    memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t));                    \
+    memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t));                    \
+    memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t));                    \
+    memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t));                    \
+    LD_UB2(&temp_u[ 0], 1, tu0, tu1);                                    \
+    LD_UB2(&temp_u[32], 1, cu0, cu1);                                    \
+    LD_UB2(&temp_v[ 0], 1, tv0, tv1);                                    \
+    LD_UB2(&temp_v[32], 1, cv0, cv1);                                    \
+    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
+    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
+    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
+    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
+    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2);     \
+    if (bot_y != NULL) {                                                 \
+      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2);  \
+    }                                                                    \
+    top_u += size;                                                       \
+    top_v += size;                                                       \
+    cur_u += size;                                                       \
+    cur_v += size;                                                       \
+  }                                                                      \
+  if (!(len & 1)) {                                                      \
+    const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16));               \
+    const uint32_t c0  = ((cur_u[0]) | ((cur_v[0]) << 16));              \
+    const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2;              \
+    FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16),                      \
+                top_dst + (len - 1) * XSTEP);                            \
+    if (bot_y != NULL) {                                                 \
+      const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2;            \
+      FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16),                    \
+           bot_dst + (len - 1) * XSTEP);                                 \
+    }                                                                    \
+  }                                                                      \
+}
+
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)  // 3 dst bytes/pixel
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)  // 3 dst bytes/pixel
+UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)  // 4 dst bytes/pixel
+UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)  // 4 dst bytes/pixel
+UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)  // 4 dst bytes/pixel
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)  // 2 dst bytes/pixel
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)  // 2 dst bytes/pixel
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;  // install MSA versions
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;  // same as MODE_RGBA
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;  // same as MODE_BGRA
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;  // same as MODE_ARGB
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;  // as RGBA_4444
+}
+
+#endif  // FANCY_UPSAMPLING
+
+#endif  // WEBP_USE_MSA
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)  // presumably an empty fallback
+#endif  // !(FANCY_UPSAMPLING && WEBP_USE_MSA)
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
index 2b0c99b..d371a83 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -28,47 +28,34 @@
 // U/V upsampling
 
 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
-#define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
-  uint8x8_t a = vld1_u8(r1);                                            \
-  uint8x8_t b = vld1_u8(r1 + 1);                                        \
-  uint8x8_t c = vld1_u8(r2);                                            \
-  uint8x8_t d = vld1_u8(r2 + 1);                                        \
-                                                                        \
-  uint16x8_t al = vshll_n_u8(a, 1);                                     \
-  uint16x8_t bl = vshll_n_u8(b, 1);                                     \
-  uint16x8_t cl = vshll_n_u8(c, 1);                                     \
-  uint16x8_t dl = vshll_n_u8(d, 1);                                     \
-                                                                        \
-  uint8x8_t diag1, diag2;                                               \
-  uint16x8_t sl;                                                        \
-                                                                        \
+#define UPSAMPLE_16PIXELS(r1, r2, out) do {                             \
+  const uint8x8_t a = vld1_u8(r1 + 0);                                  \
+  const uint8x8_t b = vld1_u8(r1 + 1);                                  \
+  const uint8x8_t c = vld1_u8(r2 + 0);                                  \
+  const uint8x8_t d = vld1_u8(r2 + 1);                                  \
   /* a + b + c + d */                                                   \
-  sl = vaddl_u8(a,  b);                                                 \
-  sl = vaddw_u8(sl, c);                                                 \
-  sl = vaddw_u8(sl, d);                                                 \
-                                                                        \
-  al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
-  bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
-                                                                        \
-  al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
-  bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
+  const uint16x8_t ad = vaddl_u8(a,  d);                                \
+  const uint16x8_t bc = vaddl_u8(b,  c);                                \
+  const uint16x8_t abcd = vaddq_u16(ad, bc);                            \
+  /* 3a +  b +  c + 3d */                                               \
+  const uint16x8_t al = vaddq_u16(abcd, vshlq_n_u16(ad, 1));            \
+  /*  a + 3b + 3c +  d */                                               \
+  const uint16x8_t bl = vaddq_u16(abcd, vshlq_n_u16(bc, 1));            \
                                                                         \
-  diag2 = vshrn_n_u16(al, 3);                                           \
-  diag1 = vshrn_n_u16(bl, 3);                                           \
+  const uint8x8_t diag2 = vshrn_n_u16(al, 3);                           \
+  const uint8x8_t diag1 = vshrn_n_u16(bl, 3);                           \
                                                                         \
-  a = vrhadd_u8(a, diag1);                                              \
-  b = vrhadd_u8(b, diag2);                                              \
-  c = vrhadd_u8(c, diag2);                                              \
-  d = vrhadd_u8(d, diag1);                                              \
+  const uint8x8_t A = vrhadd_u8(a, diag1);  /* (a + diag1 + 1) >> 1 */  \
+  const uint8x8_t B = vrhadd_u8(b, diag2);  /* (b + diag2 + 1) >> 1 */  \
+  const uint8x8_t C = vrhadd_u8(c, diag2);  /* (c + diag2 + 1) >> 1 */  \
+  const uint8x8_t D = vrhadd_u8(d, diag1);  /* (d + diag1 + 1) >> 1 */  \
                                                                         \
-  {                                                                     \
-    uint8x8x2_t a_b, c_d;                                               \
-    INIT_VECTOR2(a_b, a, b);                                            \
-    INIT_VECTOR2(c_d, c, d);                                            \
-    vst2_u8(out,      a_b);                                             \
-    vst2_u8(out + 32, c_d);                                             \
-  }                                                                     \
-}
+  uint8x8x2_t A_B, C_D;                                                 \
+  INIT_VECTOR2(A_B, A, B);                                              \
+  INIT_VECTOR2(C_D, C, D);                                              \
+  vst2_u8(out +  0, A_B);                                               \
+  vst2_u8(out + 32, C_D);                                               \
+} while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
 static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
@@ -93,7 +80,6 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
 static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
 
 #define v255 vdup_n_u8(255)
-#define v_0x0f vdup_n_u8(15)
 
 #define STORE_Rgb(out, r, g, b) do {                                    \
   uint8x8x3_t r_g_b;                                                    \
@@ -132,21 +118,16 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
 #endif
 
 #define STORE_Rgba4444(out, r, g, b) do {                               \
-  const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 4), 4);  /* 4bits */      \
-  const uint8x8_t g1 = vshr_n_u8(g, 4);                                 \
-  const uint8x8_t ba = vorr_u8(b, v_0x0f);                              \
-  const uint8x8_t rg = vorr_u8(r1, g1);                                 \
+  const uint8x8_t rg = vsri_n_u8(r, g, 4);      /* (r&0xf0) | (g>>4) */ \
+  const uint8x8_t ba = vsri_n_u8(b, v255, 4);   /* (b&0xf0) | 0x0f   */ \
   const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba);                          \
   vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1]));         \
 } while (0)
 
 #define STORE_Rgb565(out, r, g, b) do {                                 \
-  const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 3), 3);  /* 5bits */      \
-  const uint8x8_t g1 = vshr_n_u8(g, 5);                /* upper 3bits */\
-  const uint8x8_t g2 = vshl_n_u8(vshr_n_u8(g, 2), 5);  /* lower 3bits */\
-  const uint8x8_t b1 = vshr_n_u8(b, 3);                /* 5bits */      \
-  const uint8x8_t rg = vorr_u8(r1, g1);                                 \
-  const uint8x8_t gb = vorr_u8(g2, b1);                                 \
+  const uint8x8_t rg = vsri_n_u8(r, g, 5);   /* (r & 0xf8) | (g>>5)  */ \
+  const uint8x8_t g1 = vshl_n_u8(g, 3);      /* pre-shift g: 3bits */   \
+  const uint8x8_t gb = vsri_n_u8(g1, b, 3);  /* (g1 & 0xe0) | (b>>3) */ \
   const uint8x8x2_t rgb565 = ZIP_U8(rg, gb);                            \
   vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1]));             \
 } while (0)
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index f50a253..dd7d9de 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -13,6 +13,8 @@
 
 #include "./yuv.h"
 
+#include <stdlib.h>
+
 #if defined(WEBP_YUV_USE_TABLE)
 
 static int done = 0;
@@ -244,6 +246,48 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 
 //-----------------------------------------------------------------------------
 
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {  // saturates v to the [0, MAX_Y] range
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
+                                  uint16_t* dst, int len) {
+  uint64_t diff = 0;  // accumulates |ref[i] - src[i]| over all len samples
+  int i;
+  for (i = 0; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];  // signed error for sample i
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_y(new_y);  // corrected in place, saturated by clip_y()
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;  // sum of absolute differences between ref and src
+}
+
+static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
+                                int16_t* dst, int len) {
+  int i;  // dst[i] += ref[i] - src[i] for every i in [0, len)
+  for (i = 0; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;  // applied in place, without any clamping
+  }
+}
+
+static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
+                                const uint16_t* best_y, uint16_t* out) {
+  int i;  // each iteration produces two outputs: 2 * len samples in total
+  for (i = 0; i < len; ++i, ++A, ++B) {  // reads A[0..len] and B[0..len]
+    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);  // v0: 9/16 on A[0]
+    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);  // v1: 9/16 on A[1]
+  }
+}
+
+#undef MAX_Y
+
+//-----------------------------------------------------------------------------
+
 void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
 void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
 void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@@ -253,10 +297,18 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
 void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
                             int src_width, int do_store);
 
+uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
+                                uint16_t* dst, int len);
+void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
+                              int16_t* dst, int len);
+void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
+                              const uint16_t* best_y, uint16_t* out);
+
 static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
     (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
 
 extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitSharpYUVSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
   if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
@@ -269,10 +321,15 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
 
   WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
 
+  WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
+  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
+  WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
+
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitConvertARGBToYUVSSE2();
+      WebPInitSharpYUVSSE2();
     }
 #endif  // WEBP_USE_SSE2
   }
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index 01c40fc..1d33b58 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -36,7 +36,7 @@
 #define WEBP_DSP_YUV_H_
 
 #include "./dsp.h"
-#include "../dec/decode_vp8.h"
+#include "../dec/vp8_dec.h"
 
 #if defined(WEBP_EXPERIMENTAL_FEATURES)
 // Do NOT activate this feature for real compression. This is only experimental!
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
index e19bddf..e33c2bb 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -15,6 +15,8 @@
 
 #if defined(WEBP_USE_SSE2)
 
+#include "./common_sse2.h"
+#include <stdlib.h>
 #include <emmintrin.h>
 
 //-----------------------------------------------------------------------------
@@ -155,30 +157,13 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
   _mm_storeu_si128((__m128i*)dst, rgb565);
 }
 
-// Function used several times in PlanarTo24b.
-// It samples the in buffer as follows: one every two unsigned char is stored
-// at the beginning of the buffer, while the other half is stored at the end.
-static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,
-                                          __m128i* const out /*out[6]*/) {
-  const __m128i v_mask = _mm_set1_epi16(0x00ff);
-
-  // Take one every two upper 8b values.
-  out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),
-                            _mm_and_si128(in[1], v_mask));
-  out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),
-                            _mm_and_si128(in[3], v_mask));
-  out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),
-                            _mm_and_si128(in[5], v_mask));
-  // Take one every two lower 8b values.
-  out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));
-  out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));
-  out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));
-}
-
 // Pack the planar buffers
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
+static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
+                                    __m128i* const in2, __m128i* const in3,
+                                    __m128i* const in4, __m128i* const in5,
+                                    uint8_t* const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
@@ -191,22 +176,15 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
   // Repeat the same permutations twice more:
   //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
   //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
-  __m128i tmp[6];
-  PlanarTo24bHelper(in, tmp);
-  PlanarTo24bHelper(tmp, in);
-  PlanarTo24bHelper(in, tmp);
-  // We need to do it two more times than the example as we have sixteen bytes.
-  PlanarTo24bHelper(tmp, in);
-  PlanarTo24bHelper(in, tmp);
-
-  _mm_storeu_si128((__m128i*)(rgb +  0), tmp[0]);
-  _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]);
-  _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]);
-  _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]);
-  _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]);
-  _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]);
-}
-#undef MK_UINT32
+  VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
+
+  _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
+  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
+  _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
+  _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
+  _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
+  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
+}
 
 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                     uint8_t* dst) {
@@ -265,29 +243,29 @@ void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
-  __m128i rgb[6];
+  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;  // R, R, G, G, B, B planes
 
-  YUV444ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-  YUV444ToRGB(y +  8, u +  8, v +  8, &R1, &G1, &B1);
+  YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);  // inputs advance by 8
+  YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
   YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
   YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
 
   // Cast to 8b and store as RRRRGGGGBBBB.
-  rgb[0] = _mm_packus_epi16(R0, R1);
-  rgb[1] = _mm_packus_epi16(R2, R3);
-  rgb[2] = _mm_packus_epi16(G0, G1);
-  rgb[3] = _mm_packus_epi16(G2, G3);
-  rgb[4] = _mm_packus_epi16(B0, B1);
-  rgb[5] = _mm_packus_epi16(B2, B3);
+  rgb0 = _mm_packus_epi16(R0, R1);
+  rgb1 = _mm_packus_epi16(R2, R3);
+  rgb2 = _mm_packus_epi16(G0, G1);
+  rgb3 = _mm_packus_epi16(G2, G3);
+  rgb4 = _mm_packus_epi16(B0, B1);
+  rgb5 = _mm_packus_epi16(B2, B3);
 
   // Pack as RGBRGBRGBRGB.
-  PlanarTo24b(rgb, dst);
+  PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
 
 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
-  __m128i bgr[6];
+  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
   YUV444ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
   YUV444ToRGB(y +  8, u +  8, v +  8, &R1, &G1, &B1);
@@ -295,15 +273,15 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
 
   // Cast to 8b and store as BBBBGGGGRRRR.
-  bgr[0] = _mm_packus_epi16(B0, B1);
-  bgr[1] = _mm_packus_epi16(B2, B3);
-  bgr[2] = _mm_packus_epi16(G0, G1);
-  bgr[3] = _mm_packus_epi16(G2, G3);
-  bgr[4] = _mm_packus_epi16(R0, R1);
-  bgr[5] = _mm_packus_epi16(R2, R3);
+  bgr0 = _mm_packus_epi16(B0, B1);
+  bgr1 = _mm_packus_epi16(B2, B3);
+  bgr2 = _mm_packus_epi16(G0, G1);
+  bgr3 = _mm_packus_epi16(G2, G3);
+  bgr4 = _mm_packus_epi16(R0, R1);
+  bgr5= _mm_packus_epi16(R2, R3);
 
   // Pack as BGRBGRBGRBGR.
-  PlanarTo24b(bgr, dst);
+  PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
 }
 
 //-----------------------------------------------------------------------------
@@ -377,7 +355,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
-    __m128i rgb[6];
+    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
     YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
     YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
@@ -385,15 +363,15 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
     YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
 
     // Cast to 8b and store as RRRRGGGGBBBB.
-    rgb[0] = _mm_packus_epi16(R0, R1);
-    rgb[1] = _mm_packus_epi16(R2, R3);
-    rgb[2] = _mm_packus_epi16(G0, G1);
-    rgb[3] = _mm_packus_epi16(G2, G3);
-    rgb[4] = _mm_packus_epi16(B0, B1);
-    rgb[5] = _mm_packus_epi16(B2, B3);
+    rgb0 = _mm_packus_epi16(R0, R1);
+    rgb1 = _mm_packus_epi16(R2, R3);
+    rgb2 = _mm_packus_epi16(G0, G1);
+    rgb3 = _mm_packus_epi16(G2, G3);
+    rgb4 = _mm_packus_epi16(B0, B1);
+    rgb5 = _mm_packus_epi16(B2, B3);
 
     // Pack as RGBRGBRGBRGB.
-    PlanarTo24b(rgb, dst);
+    PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 
     y += 32;
     u += 16;
@@ -413,7 +391,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
-    __m128i bgr[6];
+    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
     YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
     YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
@@ -421,15 +399,15 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
     YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
 
     // Cast to 8b and store as BBBBGGGGRRRR.
-    bgr[0] = _mm_packus_epi16(B0, B1);
-    bgr[1] = _mm_packus_epi16(B2, B3);
-    bgr[2] = _mm_packus_epi16(G0, G1);
-    bgr[3] = _mm_packus_epi16(G2, G3);
-    bgr[4] = _mm_packus_epi16(R0, R1);
-    bgr[5] = _mm_packus_epi16(R2, R3);
+    bgr0 = _mm_packus_epi16(B0, B1);
+    bgr1 = _mm_packus_epi16(B2, B3);
+    bgr2 = _mm_packus_epi16(G0, G1);
+    bgr3 = _mm_packus_epi16(G2, G3);
+    bgr4 = _mm_packus_epi16(R0, R1);
+    bgr5 = _mm_packus_epi16(R2, R3);
 
     // Pack as BGRBGRBGRBGR.
-    PlanarTo24b(bgr, dst);
+    PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
 
     y += 32;
     u += 16;
@@ -499,25 +477,19 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
 
 // Convert 8 packed ARGB to r[], g[], b[]
 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
-                                            __m128i* const r,
-                                            __m128i* const g,
-                                            __m128i* const b) {
+                                            __m128i* const rgb /*out[6]*/) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i in0 = LOAD_16(argb + 0);    // argb3 | argb2 | argb1 | argb0
-  const __m128i in1 = LOAD_16(argb + 4);    // argb7 | argb6 | argb5 | argb4
-  // column-wise transpose
-  const __m128i A0 = _mm_unpacklo_epi8(in0, in1);
-  const __m128i A1 = _mm_unpackhi_epi8(in0, in1);
-  const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
-  const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
-  // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
-  // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
-  const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
-  const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
-  // store 16b
-  *r = _mm_unpacklo_epi8(C1, zero);
-  *g = _mm_unpackhi_epi8(C0, zero);
-  *b = _mm_unpacklo_epi8(C0, zero);
+  __m128i a0 = LOAD_16(argb + 0);   // four loads: 16 ARGB pixels in total
+  __m128i a1 = LOAD_16(argb + 4);
+  __m128i a2 = LOAD_16(argb + 8);
+  __m128i a3 = LOAD_16(argb + 12);
+  VP8L32bToPlanar(&a0, &a1, &a2, &a3);  // presumably splits the channels
+  rgb[0] = _mm_unpacklo_epi8(a1, zero);  // a1 -> 16b; used as red by callers
+  rgb[1] = _mm_unpackhi_epi8(a1, zero);
+  rgb[2] = _mm_unpacklo_epi8(a2, zero);  // a2 -> 16b; used as green
+  rgb[3] = _mm_unpackhi_epi8(a2, zero);
+  rgb[4] = _mm_unpacklo_epi8(a3, zero);  // a3 -> 16b; used as blue
+  rgb[5] = _mm_unpackhi_epi8(a3, zero);
 }
 
 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
@@ -649,11 +621,10 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
   const int max_width = width & ~15;
   int i;
   for (i = 0; i < max_width; i += 16) {
-    __m128i r, g, b, Y0, Y1;
-    RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);
-    ConvertRGBToY(&r, &g, &b, &Y0);
-    RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);
-    ConvertRGBToY(&r, &g, &b, &Y1);
+    __m128i Y0, Y1, rgb[6];
+    RGB32PackedToPlanar(&argb[i], rgb);
+    ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
+    ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
     STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
   }
   for (; i < width; ++i) {   // left-over
@@ -678,20 +649,18 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
   const int max_width = src_width & ~31;
   int i;
   for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
-    __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;
-    RGB32PackedToPlanar(&argb[i +  0], &r0, &g0, &b0);
-    RGB32PackedToPlanar(&argb[i +  8], &r1, &g1, &b1);
-    HorizontalAddPack(&r0, &r1, &r0);
-    HorizontalAddPack(&g0, &g1, &g0);
-    HorizontalAddPack(&b0, &b1, &b0);
-    ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);
-
-    RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);
-    RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);
-    HorizontalAddPack(&r0, &r1, &r0);
-    HorizontalAddPack(&g0, &g1, &g0);
-    HorizontalAddPack(&b0, &b1, &b0);
-    ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);
+    __m128i rgb[6], U0, V0, U1, V1;
+    RGB32PackedToPlanar(&argb[i], rgb);
+    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar(&argb[i + 16], rgb);
+    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
 
     U0 = _mm_packus_epi16(U0, U1);
     V0 = _mm_packus_epi16(V0, V1);
@@ -767,9 +736,128 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
 }
 
+//------------------------------------------------------------------------------
+
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {  // saturates v to the [0, MAX_Y] range
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len) {
+  uint64_t diff = 0;
+  uint32_t tmp[4];  // receives the four 32-bit lanes of 'sum'
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16(MAX_Y);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i sum = zero;
+
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per iteration
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
+    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
+    const __m128i F = _mm_add_epi16(C, D);       // new_y
+    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
+    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);  // clamp
+    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
+    _mm_storeu_si128((__m128i*)(dst + i), H);
+    sum = _mm_add_epi32(sum, I);
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];  // lanes summed as uint32_t
+  for (; i < len; ++i) {  // scalar tail for the remaining samples
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_y(new_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i = 0;
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per iteration
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
+    const __m128i E = _mm_add_epi16(C, D);   // new_uv
+    _mm_storeu_si128((__m128i*)(dst + i), E);
+  }
+  for (; i < len; ++i) {  // scalar tail for the remaining samples
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out) {
+  int i;
+  const __m128i kCst8 = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(MAX_Y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 inputs -> 16 outputs per pass
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+    const __m128i a0b1 = _mm_add_epi16(a0, b1);
+    const __m128i a1b0 = _mm_add_epi16(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi16(c1, a0);
+    const __m128i d1 = _mm_add_epi16(c0, a1);
+    const __m128i e0 = _mm_srai_epi16(d0, 1);  // term for even outputs
+    const __m128i e1 = _mm_srai_epi16(d1, 1);  // term for odd outputs
+    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);  // interleave e0/e1
+    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+    const __m128i h0 = _mm_add_epi16(g0, f0);  // best_y + filtered term
+    const __m128i h1 = _mm_add_epi16(g1, f1);
+    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);  // clamp
+    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+  }
+  for (; i < len; ++i) {  // scalar tail for the remaining samples
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
+  }
+}
+
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSharpYUVSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
+  WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;  // install the SSE2 versions
+  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
+  WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
+}
+
 #else  // !WEBP_USE_SSE2
 
 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
+WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
 
 #endif  // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha_enc.c
index 03e3ad0..5a2c931 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha_enc.c
@@ -14,10 +14,10 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
-#include "../utils/filters.h"
-#include "../utils/quant_levels.h"
+#include "../utils/filters_utils.h"
+#include "../utils/quant_levels_utils.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
@@ -44,7 +44,7 @@
 //           invalid quality or method, or
 //           memory allocation for the compressed data fails.
 
-#include "../enc/vp8li.h"
+#include "../enc/vp8li_enc.h"
 
 static int EncodeLossless(const uint8_t* const data, int width, int height,
                           int effort_level,  // in [0..6] range
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis_enc.c
index b55128f..dce159b 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -15,8 +15,8 @@
 #include <string.h>
 #include <assert.h>
 
-#include "./vp8enci.h"
-#include "./cost.h"
+#include "./vp8i_enc.h"
+#include "./cost_enc.h"
 #include "../utils/utils.h"
 
 #define MAX_ITERS_K_MEANS  6
@@ -262,6 +262,29 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
   return best_alpha;
 }
 
+static int FastMBAnalyze(VP8EncIterator* const it) {
+  // Empirical cut-off value, should be around 16 (~=block size). We use the
+  // [8-17] range and favor intra4 at high quality, intra16 for low quality.
+  const int q = (int)it->enc_->config_->quality;
+  const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+  int k;
+  uint32_t dc[16], m, m2;
+  for (k = 0; k < 16; k += 4) {
+    VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+  }
+  for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+    m += dc[k];
+    m2 += dc[k] * dc[k];
+  }
+  if (kThreshold * m2 < m * m) {
+    VP8SetIntra16Mode(it, 0);   // DC16
+  } else {
+    const uint8_t modes[16] = { 0 };  // DC4
+    VP8SetIntra4Mode(it, modes);
+  }
+  return 0;
+}
+
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                    int best_alpha) {
   uint8_t modes[16];
@@ -307,6 +330,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
 
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
   int best_alpha = DEFAULT_ALPHA;
+  int smallest_alpha = 0;
   int best_mode = 0;
   const int max_mode = MAX_UV_MODE;
   int mode;
@@ -322,6 +346,10 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
     alpha = GetAlpha(&histo);
     if (IS_BETTER_ALPHA(alpha, best_alpha)) {
       best_alpha = alpha;
+    }
+    // The best prediction mode tends to be the one with the smallest alpha.
+    if (mode == 0 || alpha < smallest_alpha) {
+      smallest_alpha = alpha;
       best_mode = mode;
     }
   }
@@ -339,13 +367,17 @@ static void MBAnalyze(VP8EncIterator* const it,
   VP8SetSkip(it, 0);         // not skipped
   VP8SetSegment(it, 0);      // default segment, spec-wise.
 
-  best_alpha = MBAnalyzeBestIntra16Mode(it);
-  if (enc->method_ >= 5) {
-    // We go and make a fast decision for intra4/intra16.
-    // It's usually not a good and definitive pick, but helps seeding the stats
-    // about level bit-cost.
-    // TODO(skal): improve criterion.
-    best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
+  if (enc->method_ <= 1) {
+    best_alpha = FastMBAnalyze(it);
+  } else {
+    best_alpha = MBAnalyzeBestIntra16Mode(it);
+    if (enc->method_ >= 5) {
+      // We go and make a fast decision for intra4/intra16.
+      // It's usually not a good and definitive pick, but helps seeding the
+      // stats about level bit-cost.
+      // TODO(skal): improve criterion.
+      best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
+    }
   }
   best_uv_alpha = MBAnalyzeBestUVMode(it);
 
@@ -448,7 +480,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
   const int do_segments =
       enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
       (enc->segment_hdr_.num_segments_ > 1) ||
-      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
+      (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled.
   if (do_segments) {
     const int last_row = enc->mb_h_;
     // We give a little more than a half work to the main thread.
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
index 136a24a..7c0559f 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
@@ -13,11 +13,12 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./backward_references.h"
-#include "./histogram.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
 #include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../dsp/dsp.h"
-#include "../utils/color_cache.h"
+#include "../utils/color_cache_utils.h"
 #include "../utils/utils.h"
 
 #define VALUES_IN_BYTE 256
@@ -30,8 +31,9 @@
 #define WINDOW_SIZE_BITS 20
 #define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
 
-// Bounds for the match length.
-#define MIN_LENGTH 2
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
 // If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
 // is used in VP8LHashChain.
 #define MAX_LENGTH_BITS 12
@@ -211,13 +213,13 @@ void VP8LHashChainClear(VP8LHashChain* const p) {
 
 // -----------------------------------------------------------------------------
 
-#define HASH_MULTIPLIER_HI (0xc6a4a793U)
-#define HASH_MULTIPLIER_LO (0x5bd1e996U)
+#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
+#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
 
 static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
   uint32_t key;
-  key  = argb[1] * HASH_MULTIPLIER_HI;
-  key += argb[0] * HASH_MULTIPLIER_LO;
+  key  = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
+  key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
   key = key >> (32 - HASH_BITS);
   return key;
 }
@@ -242,19 +244,26 @@ static WEBP_INLINE int MaxFindCopyLength(int len) {
 }
 
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
-                      const uint32_t* const argb, int xsize, int ysize) {
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort) {
   const int size = xsize * ysize;
   const int iter_max = GetMaxItersForQuality(quality);
-  const int iter_min = iter_max - quality / 10;
   const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
   int pos;
+  int argb_comp;
   uint32_t base_position;
   int32_t* hash_to_first_index;
   // Temporarily use the p->offset_length_ as a hash chain.
   int32_t* chain = (int32_t*)p->offset_length_;
+  assert(size > 0);
   assert(p->size_ != 0);
   assert(p->offset_length_ != NULL);
 
+  if (size <= 2) {
+    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+    return 1;
+  }
+
   hash_to_first_index =
       (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
   if (hash_to_first_index == NULL) return 0;
@@ -262,48 +271,111 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
   // Set the int32_t array to -1.
   memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
   // Fill the chain linking pixels with the same hash.
-  for (pos = 0; pos < size - 1; ++pos) {
-    const uint32_t hash_code = GetPixPairHash64(argb + pos);
-    chain[pos] = hash_to_first_index[hash_code];
-    hash_to_first_index[hash_code] = pos;
+  argb_comp = (argb[0] == argb[1]);
+  for (pos = 0; pos < size - 2;) {
+    uint32_t hash_code;
+    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+    if (argb_comp && argb_comp_next) {
+      // Consecutive pixels with the same color will share the same hash.
+      // We therefore use a different hash: the color and its repetition
+      // length.
+      uint32_t tmp[2];
+      uint32_t len = 1;
+      tmp[0] = argb[pos];
+      // Find how long the run of identical pixels is.
+      // The final pixel of the run has a different 64-bit hash, because the
+      // pixel after it has another color, so we only need to reach the last
+      // pixel that is equal to its follower.
+      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+        ++len;
+      }
+      if (len > MAX_LENGTH) {
+        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+        // because they are linked to their predecessor and we automatically
+        // check that in the main for loop below. Skipping means setting no
+        // predecessor in the chain, hence -1.
+        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+        pos += len - MAX_LENGTH;
+        len = MAX_LENGTH;
+      }
+      // Process the rest of the hash chain.
+      while (len) {
+        tmp[1] = len--;
+        hash_code = GetPixPairHash64(tmp);
+        chain[pos] = hash_to_first_index[hash_code];
+        hash_to_first_index[hash_code] = pos++;
+      }
+      argb_comp = 0;
+    } else {
+      // Just move one pixel forward.
+      hash_code = GetPixPairHash64(argb + pos);
+      chain[pos] = hash_to_first_index[hash_code];
+      hash_to_first_index[hash_code] = pos++;
+      argb_comp = argb_comp_next;
+    }
   }
+  // Process the penultimate pixel.
+  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
   WebPSafeFree(hash_to_first_index);
 
   // Find the best match interval at each pixel, defined by an offset to the
   // pixel and a length. The right-most pixel cannot match anything to the right
   // (hence a best length of 0) and the left-most pixel nothing to the left
   // (hence an offset of 0).
+  assert(size > 2);
   p->offset_length_[0] = p->offset_length_[size - 1] = 0;
-  for (base_position = size - 2 < 0 ? 0 : size - 2; base_position > 0;) {
+  for (base_position = size - 2; base_position > 0;) {
     const int max_len = MaxFindCopyLength(size - 1 - base_position);
     const uint32_t* const argb_start = argb + base_position;
     int iter = iter_max;
     int best_length = 0;
     uint32_t best_distance = 0;
+    uint32_t best_argb;
     const int min_pos =
         (base_position > window_size) ? base_position - window_size : 0;
     const int length_max = (max_len < 256) ? max_len : 256;
     uint32_t max_base_position;
 
-    for (pos = chain[base_position]; pos >= min_pos; pos = chain[pos]) {
+    pos = chain[base_position];
+    if (!low_effort) {
       int curr_length;
-      if (--iter < 0) {
-        break;
+      // Heuristic: use the comparison with the above line as an initialization.
+      if (base_position >= (uint32_t)xsize) {
+        curr_length = FindMatchLength(argb_start - xsize, argb_start,
+                                      best_length, max_len);
+        if (curr_length > best_length) {
+          best_length = curr_length;
+          best_distance = xsize;
+        }
+        --iter;
+      }
+      // Heuristic: compare to the previous pixel.
+      curr_length =
+          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+      if (curr_length > best_length) {
+        best_length = curr_length;
+        best_distance = 1;
       }
+      --iter;
+      // Skip the for loop if we already have the maximum.
+      if (best_length == MAX_LENGTH) pos = min_pos - 1;
+    }
+    best_argb = argb_start[best_length];
+
+    for (; pos >= min_pos && --iter; pos = chain[pos]) {
+      int curr_length;
       assert(base_position > (uint32_t)pos);
 
-      curr_length =
-          FindMatchLength(argb + pos, argb_start, best_length, max_len);
+      if (argb[pos + best_length] != best_argb) continue;
+
+      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
       if (best_length < curr_length) {
         best_length = curr_length;
         best_distance = base_position - pos;
-        // Stop if we have reached the maximum length. Otherwise, make sure
-        // we have executed a minimum number of iterations depending on the
-        // quality.
-        if ((best_length == MAX_LENGTH) ||
-            (curr_length >= length_max && iter < iter_min)) {
-          break;
-        }
+        best_argb = argb_start[best_length];
+        // Stop if we have reached a good enough length.
+        if (best_length >= length_max) break;
       }
     }
     // We have the best match but in case the two intervals continue matching
@@ -392,17 +464,16 @@ static int BackwardReferencesRle(int xsize, int ysize,
   i = 1;
   while (i < pix_count) {
     const int max_len = MaxFindCopyLength(pix_count - i);
-    const int kMinLength = 4;
     const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
     const int prev_row_len = (i < xsize) ? 0 :
         FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
-    if (rle_len >= prev_row_len && rle_len >= kMinLength) {
+    if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
       // We don't need to update the color cache here since it is always the
       // same pixel being copied, and that does not change the color cache
       // state.
       i += rle_len;
-    } else if (prev_row_len >= kMinLength) {
+    } else if (prev_row_len >= MIN_LENGTH) {
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
       if (use_color_cache) {
         for (k = 0; k < prev_row_len; ++k) {
@@ -442,7 +513,7 @@ static int BackwardReferencesLz77(int xsize, int ysize,
     int len = 0;
     int j;
     HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len > MIN_LENGTH + 1) {
+    if (len >= MIN_LENGTH) {
       const int len_ini = len;
       int max_reach = 0;
       assert(i + len < pix_count);
@@ -457,7 +528,7 @@ static int BackwardReferencesLz77(int xsize, int ysize,
       for (j = i_last_check + 1; j <= i + len_ini; ++j) {
         const int len_j = HashChainFindLength(hash_chain, j);
         const int reach =
-            j + (len_j > MIN_LENGTH + 1 ? len_j : 1);  // 1 for single literal.
+            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
         if (reach > max_reach) {
           len = j - i;
           max_reach = reach;
@@ -581,9 +652,10 @@ static void AddSingleLiteralWithCostModel(const uint32_t* const argb,
                                           uint16_t* const dist_array) {
   double cost_val = prev_cost;
   const uint32_t color = argb[0];
-  if (use_color_cache && VP8LColorCacheContains(hashers, color)) {
+  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+  if (ix >= 0) {
+    // use_color_cache is true and hashers contains color
     const double mul0 = 0.68;
-    const int ix = VP8LColorCacheGetIndex(hashers, color);
     cost_val += GetCacheCost(cost_model, ix) * mul0;
   } else {
     const double mul1 = 0.82;
@@ -1215,7 +1287,8 @@ static int BackwardReferencesHashChainDistanceOnly(
     int offset = 0, len = 0;
     double prev_cost = cost_manager->costs_[i - 1];
     HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len >= MIN_LENGTH) {
+    if (len >= 2) {
+      // If we are dealing with a non-literal.
       const int code = DistanceToPlaneCode(xsize, offset);
       const double offset_cost = GetDistanceCost(cost_model, code);
       const int first_i = i;
@@ -1304,20 +1377,17 @@ static int BackwardReferencesHashChainDistanceOnly(
         }
         goto next_symbol;
       }
-      if (len > MIN_LENGTH) {
-        int code_min_length;
-        double cost_total;
-        offset = HashChainFindOffset(hash_chain, i);
-        code_min_length = DistanceToPlaneCode(xsize, offset);
-        cost_total = prev_cost +
-            GetDistanceCost(cost_model, code_min_length) +
-            GetLengthCost(cost_model, 1);
+      if (len > 2) {
+        // Also try the smallest interval possible (size 2).
+        double cost_total =
+            prev_cost + offset_cost + GetLengthCost(cost_model, 1);
         if (cost_manager->costs_[i + 1] > cost_total) {
           cost_manager->costs_[i + 1] = (float)cost_total;
           dist_array[i + 1] = 2;
         }
       }
-    } else {    // len < MIN_LENGTH
+    } else {
+      // The pixel is added as a single literal so just update the costs.
       UpdateCostPerIndex(cost_manager, i + 1);
     }
 
@@ -1393,9 +1463,11 @@ static int BackwardReferencesHashChainFollowChosenPath(
       i += len;
     } else {
       PixOrCopy v;
-      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+      const int idx =
+          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+      if (idx >= 0) {
+        // use_color_cache is true and hashers contains argb[i]
         // push pixel as a color cache index
-        const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
         v = PixOrCopyCreateCacheIdx(idx);
       } else {
         if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
@@ -1454,63 +1526,89 @@ static void BackwardReferences2DLocality(int xsize,
   }
 }
 
-// Returns entropy for the given cache bits.
-static double ComputeCacheEntropy(const uint32_t* argb,
-                                  const VP8LBackwardRefs* const refs,
-                                  int cache_bits) {
-  const int use_color_cache = (cache_bits > 0);
-  int cc_init = 0;
-  double entropy = MAX_ENTROPY;
-  const double kSmallPenaltyForLargeCache = 4.0;
-  VP8LColorCache hashers;
+// Computes the entropies for a color cache size (in bits) between 0 (unused)
+// and cache_bits_max (inclusive).
+// Returns 1 on success, 0 in case of allocation error.
+static int ComputeCacheEntropies(const uint32_t* argb,
+                                 const VP8LBackwardRefs* const refs,
+                                 int cache_bits_max, double entropies[]) {
+  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  VP8LHistogram* histo = VP8LAllocateHistogram(cache_bits);
-  if (histo == NULL) goto Error;
+  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+  int ok = 0;
+  int i;
 
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
+  for (i = 0; i <= cache_bits_max; ++i) {
+    histos[i] = VP8LAllocateHistogram(i);
+    if (histos[i] == NULL) goto Error;
+    if (i == 0) continue;
+    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+    if (!cc_init[i]) goto Error;
   }
-  if (!use_color_cache) {
-    while (VP8LRefsCursorOk(&c)) {
-      VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
-      VP8LRefsCursorNext(&c);
-    }
-  } else {
+
+  assert(cache_bits_max >= 0);
+  // Do not use the color cache for cache_bits=0.
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histos[0], c.cur_pos);
+    VP8LRefsCursorNext(&c);
+  }
+  if (cache_bits_max > 0) {
+    c = VP8LRefsCursorInit(refs);
     while (VP8LRefsCursorOk(&c)) {
       const PixOrCopy* const v = c.cur_pos;
       if (PixOrCopyIsLiteral(v)) {
         const uint32_t pix = *argb++;
-        const uint32_t key = VP8LColorCacheGetIndex(&hashers, pix);
-        if (VP8LColorCacheLookup(&hashers, key) == pix) {
-          ++histo->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
-        } else {
-          VP8LColorCacheSet(&hashers, key, pix);
-          ++histo->blue_[pix & 0xff];
-          ++histo->literal_[(pix >> 8) & 0xff];
-          ++histo->red_[(pix >> 16) & 0xff];
-          ++histo->alpha_[pix >> 24];
+        // The keys of the caches can be derived from the longest one.
+        int key = HashPix(pix, 32 - cache_bits_max);
+        for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+          if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+            ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+          } else {
+            VP8LColorCacheSet(&hashers[i], key, pix);
+            ++histos[i]->blue_[pix & 0xff];
+            ++histos[i]->literal_[(pix >> 8) & 0xff];
+            ++histos[i]->red_[(pix >> 16) & 0xff];
+            ++histos[i]->alpha_[pix >> 24];
+          }
         }
       } else {
+        // Update the histograms for distance/length.
         int len = PixOrCopyLength(v);
-        int code, extra_bits;
-        VP8LPrefixEncodeBits(len, &code, &extra_bits);
-        ++histo->literal_[NUM_LITERAL_CODES + code];
-        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
-        ++histo->distance_[code];
+        int code_dist, code_len, extra_bits;
+        uint32_t argb_prev = *argb ^ 0xffffffffu;
+        VP8LPrefixEncodeBits(len, &code_len, &extra_bits);
+        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code_dist, &extra_bits);
+        for (i = 1; i <= cache_bits_max; ++i) {
+          ++histos[i]->literal_[NUM_LITERAL_CODES + code_len];
+          ++histos[i]->distance_[code_dist];
+        }
+        // Update the color caches.
         do {
-          VP8LColorCacheInsert(&hashers, *argb++);
-        } while(--len != 0);
+          if (*argb != argb_prev) {
+            // Efficiency: insert only if the color changes.
+            int key = HashPix(*argb, 32 - cache_bits_max);
+            for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+              hashers[i].colors_[key] = *argb;
+            }
+            argb_prev = *argb;
+          }
+          argb++;
+        } while (--len != 0);
       }
       VP8LRefsCursorNext(&c);
     }
   }
-  entropy = VP8LHistogramEstimateBits(histo) +
-      kSmallPenaltyForLargeCache * cache_bits;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  VP8LFreeHistogram(histo);
-  return entropy;
+  for (i = 0; i <= cache_bits_max; ++i) {
+    entropies[i] = VP8LHistogramEstimateBits(histos[i]);
+  }
+  ok = 1;
+Error:
+  for (i = 0; i <= cache_bits_max; ++i) {
+    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+    VP8LFreeHistogram(histos[i]);
+  }
+  return ok;
 }
 
 // Evaluate optimal cache bits for the local color cache.
@@ -1524,13 +1622,10 @@ static int CalculateBestCacheSize(const uint32_t* const argb,
                                   VP8LBackwardRefs* const refs,
                                   int* const lz77_computed,
                                   int* const best_cache_bits) {
-  int eval_low = 1;
-  int eval_high = 1;
-  double entropy_low = MAX_ENTROPY;
-  double entropy_high = MAX_ENTROPY;
-  const double cost_mul = 5e-4;
-  int cache_bits_low = 0;
+  int i;
   int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
+  double entropy_min = MAX_ENTROPY;
+  double entropies[MAX_COLOR_CACHE_BITS + 1];
 
   assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
 
@@ -1540,34 +1635,23 @@ static int CalculateBestCacheSize(const uint32_t* const argb,
     // Local color cache is disabled.
     return 1;
   }
-  if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, hash_chain,
-                              refs)) {
+  // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color cache
+  // is not that different in practice.
+  if (!BackwardReferencesLz77(xsize, ysize, argb, 0, hash_chain, refs)) {
     return 0;
   }
-  // Do a binary search to find the optimal entropy for cache_bits.
-  while (eval_low || eval_high) {
-    if (eval_low) {
-      entropy_low = ComputeCacheEntropy(argb, refs, cache_bits_low);
-      entropy_low += entropy_low * cache_bits_low * cost_mul;
-      eval_low = 0;
-    }
-    if (eval_high) {
-      entropy_high = ComputeCacheEntropy(argb, refs, cache_bits_high);
-      entropy_high += entropy_high * cache_bits_high * cost_mul;
-      eval_high = 0;
-    }
-    if (entropy_high < entropy_low) {
-      const int prev_cache_bits_low = cache_bits_low;
-      *best_cache_bits = cache_bits_high;
-      cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
-      if (cache_bits_low != prev_cache_bits_low) eval_low = 1;
-    } else {
-      *best_cache_bits = cache_bits_low;
-      cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
-      if (cache_bits_high != cache_bits_low) eval_high = 1;
+  // Find the cache_bits giving the lowest entropy. The search is brute-force
+  // because entropy as a function of cache_bits can have any shape in
+  // practice (it need not be monotonic or unimodal).
+  if (!ComputeCacheEntropies(argb, refs, cache_bits_high, entropies)) {
+    return 0;
+  }
+  for (i = 0; i <= cache_bits_high; ++i) {
+    if (i == 0 || entropies[i] < entropy_min) {
+      entropy_min = entropies[i];
+      *best_cache_bits = i;
     }
   }
-  *lz77_computed = 1;
   return 1;
 }
 
@@ -1584,8 +1668,9 @@ static int BackwardRefsWithLocalCache(const uint32_t* const argb,
     PixOrCopy* const v = c.cur_pos;
     if (PixOrCopyIsLiteral(v)) {
       const uint32_t argb_literal = v->argb_or_distance;
-      if (VP8LColorCacheContains(&hashers, argb_literal)) {
-        const int ix = VP8LColorCacheGetIndex(&hashers, argb_literal);
+      const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+      if (ix >= 0) {
+        // hashers contains argb_literal
         *v = PixOrCopyCreateCacheIdx(ix);
       } else {
         VP8LColorCacheInsert(&hashers, argb_literal);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
index 0cadb11..3a19aa7 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
@@ -130,7 +130,8 @@ struct VP8LHashChain {
 int VP8LHashChainInit(VP8LHashChain* const p, int size);
 // Pre-compute the best matches for argb.
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
-                      const uint32_t* const argb, int xsize, int ysize);
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
 // -----------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config_enc.c
index f9f7961..4589dc0 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config_enc.c
@@ -11,6 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
 #include "../webp/encode.h"
 
 //------------------------------------------------------------------------------
@@ -49,9 +53,8 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->thread_level = 0;
   config->low_memory = 0;
   config->near_lossless = 100;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  config->delta_palettization = 0;
-#endif // WEBP_EXPERIMENTAL_FEATURES
+  config->use_delta_palette = 0;
+  config->use_sharp_yuv = 0;
 
   // TODO(skal): tune.
   switch (preset) {
@@ -92,60 +95,36 @@ int WebPConfigInitInternal(WebPConfig* config,
 
 int WebPValidateConfig(const WebPConfig* config) {
   if (config == NULL) return 0;
-  if (config->quality < 0 || config->quality > 100)
-    return 0;
-  if (config->target_size < 0)
-    return 0;
-  if (config->target_PSNR < 0)
-    return 0;
-  if (config->method < 0 || config->method > 6)
-    return 0;
-  if (config->segments < 1 || config->segments > 4)
-    return 0;
-  if (config->sns_strength < 0 || config->sns_strength > 100)
-    return 0;
-  if (config->filter_strength < 0 || config->filter_strength > 100)
-    return 0;
-  if (config->filter_sharpness < 0 || config->filter_sharpness > 7)
-    return 0;
-  if (config->filter_type < 0 || config->filter_type > 1)
-    return 0;
-  if (config->autofilter < 0 || config->autofilter > 1)
-    return 0;
-  if (config->pass < 1 || config->pass > 10)
-    return 0;
-  if (config->show_compressed < 0 || config->show_compressed > 1)
-    return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 7)
-    return 0;
-  if (config->partitions < 0 || config->partitions > 3)
+  if (config->quality < 0 || config->quality > 100) return 0;
+  if (config->target_size < 0) return 0;
+  if (config->target_PSNR < 0) return 0;
+  if (config->method < 0 || config->method > 6) return 0;
+  if (config->segments < 1 || config->segments > 4) return 0;
+  if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
+  if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
+  if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
+  if (config->filter_type < 0 || config->filter_type > 1) return 0;
+  if (config->autofilter < 0 || config->autofilter > 1) return 0;
+  if (config->pass < 1 || config->pass > 10) return 0;
+  if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
+  if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
+  if (config->partitions < 0 || config->partitions > 3) return 0;
+  if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
+  if (config->alpha_compression < 0) return 0;
+  if (config->alpha_filtering < 0) return 0;
+  if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
+  if (config->lossless < 0 || config->lossless > 1) return 0;
+  if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
+  if (config->image_hint >= WEBP_HINT_LAST) return 0;
+  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
+  if (config->thread_level < 0 || config->thread_level > 1) return 0;
+  if (config->low_memory < 0 || config->low_memory > 1) return 0;
+  if (config->exact < 0 || config->exact > 1) return 0;
+  if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
     return 0;
-  if (config->partition_limit < 0 || config->partition_limit > 100)
-    return 0;
-  if (config->alpha_compression < 0)
-    return 0;
-  if (config->alpha_filtering < 0)
-    return 0;
-  if (config->alpha_quality < 0 || config->alpha_quality > 100)
-    return 0;
-  if (config->lossless < 0 || config->lossless > 1)
-    return 0;
-  if (config->near_lossless < 0 || config->near_lossless > 100)
-    return 0;
-  if (config->image_hint >= WEBP_HINT_LAST)
-    return 0;
-  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
-    return 0;
-  if (config->thread_level < 0 || config->thread_level > 1)
-    return 0;
-  if (config->low_memory < 0 || config->low_memory > 1)
-    return 0;
-  if (config->exact < 0 || config->exact > 1)
-    return 0;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (config->delta_palettization < 0 || config->delta_palettization > 1)
-    return 0;
-#endif  // WEBP_EXPERIMENTAL_FEATURES
+  }
+  if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
+
   return 1;
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost_enc.c
index ae7fe01..c823f5a 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./cost.h"
+#include "./cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Level cost tables
@@ -281,18 +281,6 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 // Recording of token probabilities.
 
-// Record proba context used
-static int Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-  return bit;
-}
-
 // We keep the table-free variant around for reference, in case.
 #define USE_LEVEL_CODE_TABLE
 
@@ -303,31 +291,31 @@ int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
   // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
   proba_t* s = res->stats[n][ctx];
   if (res->last  < 0) {
-    Record(0, s + 0);
+    VP8RecordStats(0, s + 0);
     return 0;
   }
   while (n <= res->last) {
     int v;
-    Record(1, s + 0);  // order of record doesn't matter
+    VP8RecordStats(1, s + 0);  // order of record doesn't matter
     while ((v = res->coeffs[n++]) == 0) {
-      Record(0, s + 1);
+      VP8RecordStats(0, s + 1);
       s = res->stats[VP8EncBands[n]][0];
     }
-    Record(1, s + 1);
-    if (!Record(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
+    VP8RecordStats(1, s + 1);
+    if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
       s = res->stats[VP8EncBands[n]][1];
     } else {
       v = abs(v);
 #if !defined(USE_LEVEL_CODE_TABLE)
-      if (!Record(v > 4, s + 3)) {
-        if (Record(v != 2, s + 4))
-          Record(v == 4, s + 5);
-      } else if (!Record(v > 10, s + 6)) {
-        Record(v > 6, s + 7);
-      } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
-        Record((v >= 3 + (8 << 1)), s + 9);
+      if (!VP8RecordStats(v > 4, s + 3)) {
+        if (VP8RecordStats(v != 2, s + 4))
+          VP8RecordStats(v == 4, s + 5);
+      } else if (!VP8RecordStats(v > 10, s + 6)) {
+        VP8RecordStats(v > 6, s + 7);
+      } else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
+        VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
       } else {
-        Record((v >= 3 + (8 << 3)), s + 10);
+        VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
       }
 #else
       if (v > MAX_VARIABLE_LEVEL) {
@@ -340,14 +328,14 @@ int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
         int i;
         for (i = 0; (pattern >>= 1) != 0; ++i) {
           const int mask = 2 << i;
-          if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
+          if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
         }
       }
 #endif
       s = res->stats[VP8EncBands[n]][2];
     }
   }
-  if (n < 16) Record(0, s + 0);
+  if (n < 16) VP8RecordStats(0, s + 0);
   return 1;
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost_enc.h
index 20960d6..99e4b37 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost_enc.h
@@ -16,7 +16,7 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -41,6 +41,20 @@ void VP8InitResidual(int first, int coeff_type,
 
 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
 
+// Record proba context used: accumulate 'bit' into '*stats', return 'bit'.
+static WEBP_INLINE int VP8RecordStats(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  // An overflow is inbound. Note we handle this at 0xfffe0000u instead of
+  // 0xffff0000u to make sure p + 1u does not overflow.
+  if (p >= 0xfffe0000u) {
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> halve both 16-bit counts.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
+  return bit;
+}
+
 // Cost of coding one event with probability 'proba'.
 static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
   return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.c b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
index 062e588..eaf0f05 100644
--- a/src/3rdparty/libwebp/src/enc/delta_palettization.c
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
@@ -10,7 +10,7 @@
 // Author: Mislav Bradac (mislavm@google.com)
 //
 
-#include "./delta_palettization.h"
+#include "./delta_palettization_enc.h"
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
 #include "../webp/types.h"
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.h b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
index e41c0c5..63048ec 100644
--- a/src/3rdparty/libwebp/src/enc/delta_palettization.h
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
@@ -14,7 +14,7 @@
 #define WEBP_ENC_DELTA_PALETTIZATION_H_
 
 #include "../webp/encode.h"
-#include "../enc/vp8li.h"
+#include "../enc/vp8li_enc.h"
 
 // Replaces enc->argb_[] input by a palettizable approximation of it,
 // and generates optimal enc->palette_[].
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter_enc.c
index e8ea8b4..4bc3672 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter_enc.c
@@ -12,7 +12,7 @@
 // Author: somnath@google.com (Somnath Banerjee)
 
 #include <assert.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
 
 // This table gives, for a given sharpness, the filtering strength to be
@@ -105,115 +105,28 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
 }
 
 //------------------------------------------------------------------------------
-// SSIM metric
-
-static const double kMinValue = 1.e-10;  // minimal threshold
-
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst) {
-  dst->w   += src->w;
-  dst->xm  += src->xm;
-  dst->ym  += src->ym;
-  dst->xxm += src->xxm;
-  dst->xym += src->xym;
-  dst->yym += src->yym;
-}
-
-double VP8SSIMGet(const VP8DistoStats* const stats) {
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double w2 = stats->w * stats->w;
-  double sxx = stats->xxm * stats->w - xmxm;
-  double syy = stats->yym * stats->w - ymym;
-  double sxy = stats->xym * stats->w - xmym;
-  double C1, C2;
-  double fnum;
-  double fden;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  C1 = 6.5025 * w2;
-  C2 = 58.5225 * w2;
-  fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
-}
-
-double VP8SSIMGetSquaredError(const VP8DistoStats* const s) {
-  if (s->w > 0.) {
-    const double iw2 = 1. / (s->w * s->w);
-    const double sxx = s->xxm * s->w - s->xm * s->xm;
-    const double syy = s->yym * s->w - s->ym * s->ym;
-    const double sxy = s->xym * s->w - s->xm * s->ym;
-    const double SSE = iw2 * (sxx + syy - 2. * sxy);
-    if (SSE > kMinValue) return SSE;
-  }
-  return kMinValue;
-}
-
-#define LIMIT(A, M)  ((A) > (M) ? (M) : (A))
-static void VP8SSIMAccumulateRow(const uint8_t* src1, int stride1,
-                                 const uint8_t* src2, int stride2,
-                                 int y, int W, int H,
-                                 VP8DistoStats* const stats) {
-  int x = 0;
-  const int w0 = LIMIT(VP8_SSIM_KERNEL, W);
-  for (x = 0; x < w0; ++x) {
-    VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-  }
-  for (; x <= W - 8 + VP8_SSIM_KERNEL; ++x) {
-    VP8SSIMAccumulate(
-        src1 + (y - VP8_SSIM_KERNEL) * stride1 + (x - VP8_SSIM_KERNEL), stride1,
-        src2 + (y - VP8_SSIM_KERNEL) * stride2 + (x - VP8_SSIM_KERNEL), stride2,
-        stats);
-  }
-  for (; x < W; ++x) {
-    VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-  }
-}
-
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
-                            const uint8_t* src2, int stride2,
-                            int W, int H, VP8DistoStats* const stats) {
-  int x, y;
-  const int h0 = LIMIT(VP8_SSIM_KERNEL, H);
-  const int h1 = LIMIT(VP8_SSIM_KERNEL, H - VP8_SSIM_KERNEL);
-  for (y = 0; y < h0; ++y) {
-    for (x = 0; x < W; ++x) {
-      VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-    }
-  }
-  for (; y < h1; ++y) {
-    VP8SSIMAccumulateRow(src1, stride1, src2, stride2, y, W, H, stats);
-  }
-  for (; y < H; ++y) {
-    for (x = 0; x < W; ++x) {
-      VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-    }
-  }
-}
-#undef LIMIT
+// SSIM metric for one macroblock
 
 static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   int x, y;
-  VP8DistoStats s = { .0, .0, .0, .0, .0, .0 };
+  double sum = 0.;
 
   // compute SSIM in a 10 x 10 window
   for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
     for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
-      VP8SSIMAccumulateClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
-                               x, y, 16, 16, &s);
+      sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+                               x, y, 16, 16);
     }
   }
   for (x = 1; x < 7; x++) {
     for (y = 1; y < 7; y++) {
-      VP8SSIMAccumulateClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
-                               x, y, 8, 8, &s);
-      VP8SSIMAccumulateClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
-                               x, y, 8, 8, &s);
+      sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+                               x, y, 8, 8);
+      sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+                               x, y, 8, 8);
     }
   }
-  return VP8SSIMGet(&s);
+  return sum;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame_enc.c
index 5b7a40b..abef523 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame_enc.c
@@ -14,8 +14,8 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost.h"
-#include "./vp8enci.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
 #include "../webp/format_constants.h"  // RIFF constants
 
@@ -185,6 +185,13 @@ static int GetProba(int a, int b) {
                       : (255 * a + total / 2) / total;  // rounded proba
 }
 
+static void ResetSegments(VP8Encoder* const enc) {
+  int n;
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    enc->mb_info_[n].segment_ = 0;  // move every mb_info_ entry to segment 0
+  }
+}
+
 static void SetSegmentProbas(VP8Encoder* const enc) {
   int p[NUM_MB_SEGMENTS] = { 0 };
   int n;
@@ -206,6 +213,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
 
     enc->segment_hdr_.update_map_ =
         (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
     enc->segment_hdr_.size_ =
         p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
         p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
@@ -240,8 +248,9 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
       p = res->prob[VP8EncBands[n]][1];
     } else {
       if (!VP8PutBit(bw, v > 4, p[3])) {
-        if (VP8PutBit(bw, v != 2, p[4]))
+        if (VP8PutBit(bw, v != 2, p[4])) {
           VP8PutBit(bw, v == 4, p[5]);
+        }
       } else if (!VP8PutBit(bw, v > 10, p[6])) {
         if (!VP8PutBit(bw, v > 6, p[7])) {
           VP8PutBit(bw, v == 6, 159);
@@ -406,9 +415,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
     VP8InitResidual(0, 1, enc, &res);
     VP8SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
-        VP8RecordCoeffTokens(ctx, 1,
-                             res.first, res.last, res.coeffs, tokens);
-    VP8RecordCoeffs(ctx, &res);
+        VP8RecordCoeffTokens(ctx, &res, tokens);
     VP8InitResidual(1, 0, enc, &res);
   } else {
     VP8InitResidual(0, 3, enc, &res);
@@ -420,9 +427,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
       VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
       it->top_nz_[x] = it->left_nz_[y] =
-          VP8RecordCoeffTokens(ctx, res.coeff_type,
-                               res.first, res.last, res.coeffs, tokens);
-      VP8RecordCoeffs(ctx, &res);
+          VP8RecordCoeffTokens(ctx, &res, tokens);
     }
   }
 
@@ -434,9 +439,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
         VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
-            VP8RecordCoeffTokens(ctx, 2,
-                                 res.first, res.last, res.coeffs, tokens);
-        VP8RecordCoeffs(ctx, &res);
+            VP8RecordCoeffTokens(ctx, &res, tokens);
       }
     }
   }
@@ -555,8 +558,9 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
     size += info.R + info.H;
     size_p0 += info.H;
     distortion += info.D;
-    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
       return 0;
+    }
     VP8IteratorSaveBoundary(&it);
   } while (VP8IteratorNext(&it) && --nb_mbs > 0);
 
@@ -814,7 +818,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
            num_pass_left, stats.last_value, stats.value,
            stats.last_q, stats.q, stats.dq);
 #endif
-    if (size_p0 > PARTITION0_SIZE_LIMIT) {
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
       ++num_pass_left;
       enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
       continue;                        // ...and start over
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram_enc.c
index 395372b..808b6f7 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -15,9 +15,10 @@
 
 #include <math.h>
 
-#include "./backward_references.h"
-#include "./histogram.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
 #include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../utils/utils.h"
 
 #define MAX_COST 1.e38
@@ -213,10 +214,19 @@ static double InitialHuffmanCost(void) {
 
 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
 static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+  // The constants in this function are experimental and got rounded from
+  // their original values in 1/8 when switched to 1/1024.
   double retval = InitialHuffmanCost();
+  // Second coefficient: Many zeros in the histogram are covered efficiently
+  // by a run-length encode. Originally 2/8.
   retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  // Second coefficient: Constant values are encoded less efficiently, but still
+  // RLE'ed. Originally 6/8.
   retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  // 0s are usually encoded more efficiently than non-0s.
+  // Originally 15/8.
   retval += 1.796875 * stats->streaks[0][0];
+  // Originally 26/8.
   retval += 3.28125 * stats->streaks[1][0];
   return retval;
 }
@@ -236,14 +246,30 @@ static double PopulationCost(const uint32_t* const population, int length,
   return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
 
+// trivial_at_end is 1 if the two histograms only have one element that is
+// non-zero: both the zero-th one, or both the last one.
 static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
                                              const uint32_t* const Y,
-                                             int length) {
-  VP8LBitEntropy bit_entropy;
+                                             int length, int trivial_at_end) {
   VP8LStreaks stats;
-  VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+  if (trivial_at_end) {
+    // This configuration is due to palettization that transforms an indexed
+    // pixel into 0xff000000 | (pixel << 8) in VP8LBundleColorMap.
+    // BitsEntropyRefine is 0 for histograms with only one non-zero value.
+    // Only FinalHuffmanCost needs to be evaluated.
+    memset(&stats, 0, sizeof(stats));
+    // Deal with the non-zero value at index 0 or length-1.
+    stats.streaks[1][0] += 1;
+    // Deal with the following/previous zero streak.
+    stats.counts[0] += 1;
+    stats.streaks[0][1] += length - 1;
+    return FinalHuffmanCost(&stats);
+  } else {
+    VP8LBitEntropy bit_entropy;
+    VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
 
-  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+    return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+  }
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
@@ -267,24 +293,42 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
                                        double cost_threshold,
                                        double* cost) {
   const int palette_code_bits = a->palette_code_bits_;
+  int trivial_at_end = 0;
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   *cost += GetCombinedEntropy(a->literal_, b->literal_,
-                              VP8LHistogramNumCodes(palette_code_bits));
+                              VP8LHistogramNumCodes(palette_code_bits), 0);
   *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
                                  b->literal_ + NUM_LITERAL_CODES,
                                  NUM_LENGTH_CODES);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES);
+  if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
+      a->trivial_symbol_ == b->trivial_symbol_) {
+    // A, R and B are all 0 or 0xff.
+    const uint32_t color_a = (a->trivial_symbol_ >> 24) & 0xff;
+    const uint32_t color_r = (a->trivial_symbol_ >> 16) & 0xff;
+    const uint32_t color_b = (a->trivial_symbol_ >> 0) & 0xff;
+    if ((color_a == 0 || color_a == 0xff) &&
+        (color_r == 0 || color_r == 0xff) &&
+        (color_b == 0 || color_b == 0xff)) {
+      trivial_at_end = 1;
+    }
+  }
+
+  *cost +=
+      GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES);
+  *cost +=
+      GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES);
+  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+                              trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+  *cost +=
+      GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, 0);
   *cost +=
       VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
   if (*cost > cost_threshold) return 0;
@@ -292,6 +336,15 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
   return 1;
 }
 
+static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
+                                     const VP8LHistogram* const b,
+                                     VP8LHistogram* const out) {
+  VP8LHistogramAdd(a, b, out);  // some callers pass out == b (in-place sum)
+  out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_)
+                       ? a->trivial_symbol_     // shared by a and b: keep it
+                       : VP8L_NON_TRIVIAL_SYM;  // symbols differ
+}
+
 // Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
 // to the threshold value 'cost_threshold'. The score returned is
 //  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
@@ -307,11 +360,9 @@ static double HistogramAddEval(const VP8LHistogram* const a,
   cost_threshold += sum_cost;
 
   if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
-    VP8LHistogramAdd(a, b, out);
+    HistogramAdd(a, b, out);
     out->bit_cost_ = cost;
     out->palette_code_bits_ = a->palette_code_bits_;
-    out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_) ?
-        a->trivial_symbol_ : VP8L_NON_TRIVIAL_SYM;
   }
 
   return cost - sum_cost;
@@ -450,113 +501,103 @@ static void HistogramCopyAndAnalyze(
 // Partition histograms to different entropy bins for three dominant (literal,
 // red and blue) symbol costs and compute the histogram aggregate bit_cost.
 static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
-                                       int16_t* const bin_map, int low_effort) {
+                                       uint16_t* const bin_map,
+                                       int low_effort) {
   int i;
   VP8LHistogram** const histograms = image_histo->histograms;
   const int histo_size = image_histo->size;
-  const int bin_depth = histo_size + 1;
   DominantCostRange cost_range;
   DominantCostRangeInit(&cost_range);
 
   // Analyze the dominant (literal, red and blue) entropy costs.
   for (i = 0; i < histo_size; ++i) {
-    VP8LHistogram* const histo = histograms[i];
-    UpdateDominantCostRange(histo, &cost_range);
+    UpdateDominantCostRange(histograms[i], &cost_range);
   }
 
   // bin-hash histograms on three of the dominant (literal, red and blue)
-  // symbol costs.
+  // symbol costs and store the resulting bin_id for each histogram.
   for (i = 0; i < histo_size; ++i) {
-    const VP8LHistogram* const histo = histograms[i];
-    const int bin_id = GetHistoBinIndex(histo, &cost_range, low_effort);
-    const int bin_offset = bin_id * bin_depth;
-    // bin_map[n][0] for every bin 'n' maintains the counter for the number of
-    // histograms in that bin.
-    // Get and increment the num_histos in that bin.
-    const int num_histos = ++bin_map[bin_offset];
-    assert(bin_offset + num_histos < bin_depth * BIN_SIZE);
-    // Add histogram i'th index at num_histos (last) position in the bin_map.
-    bin_map[bin_offset + num_histos] = i;
-  }
-}
-
-// Compact the histogram set by removing unused entries.
-static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
-  VP8LHistogram** const histograms = image_histo->histograms;
-  int i, j;
-
-  for (i = 0, j = 0; i < image_histo->size; ++i) {
-    if (histograms[i] != NULL && histograms[i]->bit_cost_ != 0.) {
-      if (j < i) {
-        histograms[j] = histograms[i];
-        histograms[i] = NULL;
-      }
-      ++j;
-    }
+    bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort);
   }
-  image_histo->size = j;
 }
 
+// Compact image_histo[] by merging some histograms with same bin_id together if
+// it's advantageous.
 static VP8LHistogram* HistogramCombineEntropyBin(
     VP8LHistogramSet* const image_histo,
     VP8LHistogram* cur_combo,
-    int16_t* const bin_map, int bin_depth, int num_bins,
+    const uint16_t* const bin_map, int bin_map_size, int num_bins,
     double combine_cost_factor, int low_effort) {
-  int bin_id;
   VP8LHistogram** const histograms = image_histo->histograms;
-
-  for (bin_id = 0; bin_id < num_bins; ++bin_id) {
-    const int bin_offset = bin_id * bin_depth;
-    const int num_histos = bin_map[bin_offset];
-    const int idx1 = bin_map[bin_offset + 1];
-    int num_combine_failures = 0;
-    int n;
-    for (n = 2; n <= num_histos; ++n) {
-      const int idx2 = bin_map[bin_offset + n];
-      if (low_effort) {
-        // Merge all histograms with the same bin index, irrespective of cost of
-        // the merged histograms.
-        VP8LHistogramAdd(histograms[idx1], histograms[idx2], histograms[idx1]);
-        histograms[idx2]->bit_cost_ = 0.;
-      } else {
-        const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
-        if (bit_cost_idx2 > 0.) {
-          const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
-          const double curr_cost_diff =
-              HistogramAddEval(histograms[idx1], histograms[idx2],
-                               cur_combo, bit_cost_thresh);
-          if (curr_cost_diff < bit_cost_thresh) {
-            // Try to merge two histograms only if the combo is a trivial one or
-            // the two candidate histograms are already non-trivial.
-            // For some images, 'try_combine' turns out to be false for a lot of
-            // histogram pairs. In that case, we fallback to combining
-            // histograms as usual to avoid increasing the header size.
-            const int try_combine =
-                (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
-                ((histograms[idx1]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
-                 (histograms[idx2]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
-            const int max_combine_failures = 32;
-            if (try_combine || (num_combine_failures >= max_combine_failures)) {
-              HistogramSwap(&cur_combo, &histograms[idx1]);
-              histograms[idx2]->bit_cost_ = 0.;
-            } else {
-              ++num_combine_failures;
-            }
-          }
+  int idx;
+  // Work in-place: processed histograms are put at the beginning of
+  // image_histo[]. At the end, we just have to truncate the array.
+  int size = 0;
+  struct {
+    int16_t first;    // position of the histogram that accumulates all
+                      // histograms with the same bin_id
+    uint16_t num_combine_failures;   // number of combine failures per bin_id
+  } bin_info[BIN_SIZE];
+
+  assert(num_bins <= BIN_SIZE);
+  for (idx = 0; idx < num_bins; ++idx) {
+    bin_info[idx].first = -1;
+    bin_info[idx].num_combine_failures = 0;
+  }
+
+  for (idx = 0; idx < bin_map_size; ++idx) {
+    const int bin_id = bin_map[idx];
+    const int first = bin_info[bin_id].first;
+    assert(size <= idx);
+    if (first == -1) {
+      // just move histogram #idx to its final position
+      histograms[size] = histograms[idx];
+      bin_info[bin_id].first = size++;
+    } else if (low_effort) {
+      HistogramAdd(histograms[idx], histograms[first], histograms[first]);
+    } else {
+      // try to merge #idx into #first (both share the same bin_id)
+      const double bit_cost = histograms[idx]->bit_cost_;
+      const double bit_cost_thresh = -bit_cost * combine_cost_factor;
+      const double curr_cost_diff =
+          HistogramAddEval(histograms[first], histograms[idx],
+                           cur_combo, bit_cost_thresh);
+      if (curr_cost_diff < bit_cost_thresh) {
+        // Try to merge two histograms only if the combo is a trivial one or
+        // the two candidate histograms are already non-trivial.
+        // For some images, 'try_combine' turns out to be false for a lot of
+        // histogram pairs. In that case, we fallback to combining
+        // histograms as usual to avoid increasing the header size.
+        const int try_combine =
+            (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+            ((histograms[idx]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+             (histograms[first]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+        const int max_combine_failures = 32;
+        if (try_combine ||
+            bin_info[bin_id].num_combine_failures >= max_combine_failures) {
+          // move the (better) merged histogram to its final slot
+          HistogramSwap(&cur_combo, &histograms[first]);
+        } else {
+          histograms[size++] = histograms[idx];
+          ++bin_info[bin_id].num_combine_failures;
         }
+      } else {
+        histograms[size++] = histograms[idx];
       }
     }
-    if (low_effort) {
-      // Update the bit_cost for the merged histograms (per bin index).
-      UpdateHistogramCost(histograms[idx1]);
+  }
+  image_histo->size = size;
+  if (low_effort) {
+    // for low_effort case, update the final cost when everything is merged
+    for (idx = 0; idx < size; ++idx) {
+      UpdateHistogramCost(histograms[idx]);
     }
   }
-  HistogramCompactBins(image_histo);
   return cur_combo;
 }
 
-static uint32_t MyRand(uint32_t *seed) {
-  *seed *= 16807U;
+static uint32_t MyRand(uint32_t* const seed) {
+  *seed = (*seed * 16807ull) & 0xffffffffu;
   if (*seed == 0) {
     *seed = 1;
   }
@@ -592,8 +633,8 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
   histo_queue->max_size = max_index * max_index;
   // We allocate max_size + 1 because the last element at index "size" is
   // used as temporary data (and it could be up to max_size).
-  histo_queue->queue = WebPSafeMalloc(histo_queue->max_size + 1,
-                                      sizeof(*histo_queue->queue));
+  histo_queue->queue = (HistogramPair*)WebPSafeMalloc(
+      histo_queue->max_size + 1, sizeof(*histo_queue->queue));
   return histo_queue->queue != NULL;
 }
 
@@ -659,7 +700,8 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
   int i, j;
   VP8LHistogram** const histograms = image_histo->histograms;
   // Indexes of remaining histograms.
-  int* const clusters = WebPSafeMalloc(image_histo_size, sizeof(*clusters));
+  int* const clusters =
+      (int*)WebPSafeMalloc(image_histo_size, sizeof(*clusters));
   // Priority queue of histogram pairs.
   HistoQueue histo_queue;
 
@@ -681,7 +723,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
     HistogramPair* copy_to;
     const int idx1 = histo_queue.queue[0].idx1;
     const int idx2 = histo_queue.queue[0].idx2;
-    VP8LHistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+    HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
     histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
     // Remove merged histogram.
     for (i = 0; i + 1 < image_histo_size; ++i) {
@@ -747,6 +789,8 @@ static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
   const int outer_iters = image_histo_size * iter_mult;
   const int num_pairs = image_histo_size / 2;
   const int num_tries_no_success = outer_iters / 2;
+  int idx2_max = image_histo_size - 1;
+  int do_brute_dorce = 0;
   VP8LHistogram** const histograms = image_histo->histograms;
 
   // Collapse similar histograms in 'image_histo'.
@@ -757,43 +801,62 @@ static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
     double best_cost_diff = 0.;
     int best_idx1 = -1, best_idx2 = 1;
     int j;
-    const int num_tries =
+    int num_tries =
         (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
+    // Use a brute force approach if:
+    // - stochastic has not worked for a while and
+    // - if the number of iterations for brute force is less than the number of
+    // iterations if we never find a match ever again stochastically (hence
+    // num_tries times the number of remaining outer iterations).
+    do_brute_dorce =
+        (tries_with_no_success > 10) &&
+        (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
+    if (do_brute_dorce) num_tries = idx2_max;
+
     seed += iter;
     for (j = 0; j < num_tries; ++j) {
       double curr_cost_diff;
       // Choose two histograms at random and try to combine them.
-      const uint32_t idx1 = MyRand(&seed) % image_histo_size;
-      const uint32_t tmp = (j & 7) + 1;
-      const uint32_t diff =
-          (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
-      const uint32_t idx2 = (idx1 + diff + 1) % image_histo_size;
-      if (idx1 == idx2) {
-        continue;
+      uint32_t idx1, idx2;
+      if (do_brute_dorce) {
+        // Use a brute force approach.
+        idx1 = (uint32_t)j;
+        idx2 = (uint32_t)idx2_max;
+      } else {
+        const uint32_t tmp = (j & 7) + 1;
+        const uint32_t diff =
+            (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
+        idx1 = MyRand(&seed) % image_histo_size;
+        idx2 = (idx1 + diff + 1) % image_histo_size;
+        if (idx1 == idx2) {
+          continue;
+        }
       }
 
       // Calculate cost reduction on combining.
       curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
                                         tmp_histo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {    // found a better pair?
+      if (curr_cost_diff < best_cost_diff) {  // found a better pair?
         HistogramSwap(&best_combo, &tmp_histo);
         best_cost_diff = curr_cost_diff;
         best_idx1 = idx1;
         best_idx2 = idx2;
       }
     }
+    if (do_brute_dorce) --idx2_max;
 
     if (best_idx1 >= 0) {
       HistogramSwap(&best_combo, &histograms[best_idx1]);
       // swap best_idx2 slot with last one (which is now unused)
       --image_histo_size;
+      if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
       if (best_idx2 != image_histo_size) {
         HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
         histograms[image_histo_size] = NULL;
       }
       tries_with_no_success = 0;
     }
-    if (++tries_with_no_success >= num_tries_no_success) {
+    if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
       break;
     }
   }
@@ -842,7 +905,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
 
   for (i = 0; i < in_size; ++i) {
     const int idx = symbols[i];
-    VP8LHistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
+    HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
   }
 }
 
@@ -868,32 +931,18 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
   const int image_histo_raw_size = histo_xsize * histo_ysize;
-  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
-
-  // The bin_map for every bin follows following semantics:
-  // bin_map[n][0] = num_histo; // The number of histograms in that bin.
-  // bin_map[n][1] = index of first histogram in that bin;
-  // bin_map[n][num_histo] = index of last histogram in that bin;
-  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = unused indices.
-  const int bin_depth = image_histo_raw_size + 1;
-  int16_t* bin_map = NULL;
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
   VP8LHistogram* cur_combo;
+  // Don't attempt the linear bin-partition heuristic for histograms of
+  // small sizes (as bin_map will be very sparse), nor at maximum quality
+  // q==100 (to preserve the compression gains at that level).
+  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
   const int entropy_combine =
       (orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100);
 
   if (orig_histo == NULL) goto Error;
 
-  // Don't attempt linear bin-partition heuristic for:
-  // histograms of small sizes, as bin_map will be very sparse and;
-  // Maximum quality (q==100), to preserve the compression gains at that level.
-  if (entropy_combine) {
-    const int bin_map_size = bin_depth * entropy_combine_num_bins;
-    bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
-    if (bin_map == NULL) goto Error;
-  }
-
   // Construct the histograms from backward references.
   HistogramBuild(xsize, histo_bits, refs, orig_histo);
   // Copies the histograms and computes its bit_cost.
@@ -901,12 +950,17 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
   cur_combo = tmp_histos->histograms[1];  // pick up working slot
   if (entropy_combine) {
+    const int bin_map_size = orig_histo->size;
+    // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
+    uint16_t* const bin_map = histogram_symbols;
     const double combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
+
     HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo, bin_map,
-                                           bin_depth, entropy_combine_num_bins,
+    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
+                                           bin_map, bin_map_size,
+                                           entropy_combine_num_bins,
                                            combine_cost_factor, low_effort);
   }
 
@@ -931,7 +985,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   ok = 1;
 
  Error:
-  WebPSafeFree(bin_map);
   VP8LFreeHistogramSet(orig_histo);
   return ok;
 }
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram_enc.h
index d303d1d..a9d258a 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.h
@@ -16,7 +16,7 @@
 
 #include <string.h>
 
-#include "./backward_references.h"
+#include "./backward_references_enc.h"
 #include "../webp/format_constants.h"
 #include "../webp/types.h"
 
diff --git a/src/3rdparty/libwebp/src/enc/iterator.c b/src/3rdparty/libwebp/src/enc/iterator_enc.c
index 99d960a..e48d30b 100644
--- a/src/3rdparty/libwebp/src/enc/iterator.c
+++ b/src/3rdparty/libwebp/src/enc/iterator_enc.c
@@ -13,7 +13,7 @@
 
 #include <string.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // VP8Iterator
@@ -53,7 +53,6 @@ void VP8IteratorReset(VP8EncIterator* const it) {
   VP8IteratorSetRow(it, 0);
   VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
   InitTop(it);
-  InitLeft(it);
   memset(it->bit_count_, 0, sizeof(it->bit_count_));
   it->do_trellis_ = 0;
 }
@@ -68,8 +67,6 @@ int VP8IteratorIsDone(const VP8EncIterator* const it) {
 
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
-  it->y_stride_  = enc->pic_->y_stride;
-  it->uv_stride_ = enc->pic_->uv_stride;
   it->yuv_in_   = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
   it->yuv_out_  = it->yuv_in_ + YUV_SIZE_ENC;
   it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
@@ -309,14 +306,14 @@ void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
 }
 
 int VP8IteratorNext(VP8EncIterator* const it) {
-  it->preds_ += 4;
-  it->mb_ += 1;
-  it->nz_ += 1;
-  it->y_top_ += 16;
-  it->uv_top_ += 16;
-  it->x_ += 1;
-  if (it->x_ == it->enc_->mb_w_) {
+  if (++it->x_ == it->enc_->mb_w_) {
     VP8IteratorSetRow(it, ++it->y_);
+  } else {
+    it->preds_ += 4;
+    it->mb_ += 1;
+    it->nz_ += 1;
+    it->y_top_ += 16;
+    it->uv_top_ += 16;
   }
   return (0 < --it->count_down_);
 }
diff --git a/src/3rdparty/libwebp/src/enc/near_lossless.c b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
index f4ab91f..2bd03ab 100644
--- a/src/3rdparty/libwebp/src/enc/near_lossless.c
+++ b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
@@ -17,9 +17,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../utils/utils.h"
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 #define MIN_DIM_FOR_NEAR_LOSSLESS 64
 #define MAX_LIMIT_BITS             5
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
index 607a624..e5d1c75 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,8 +15,8 @@
 #include <stdlib.h>
 #include <math.h>
 
-#include "./vp8enci.h"
-#include "../utils/random.h"
+#include "./vp8i_enc.h"
+#include "../utils/random_utils.h"
 #include "../utils/utils.h"
 #include "../dsp/yuv.h"
 
@@ -153,9 +153,9 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 }
 
 //------------------------------------------------------------------------------
-// Smart RGB->YUV conversion
+// Sharp RGB->YUV conversion
 
-static const int kNumIterations = 6;
+static const int kNumIterations = 4;
 static const int kMinDimensionIterativeConversion = 4;
 
 // We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
@@ -171,9 +171,9 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 #if defined(USE_GAMMA_COMPRESSION)
 
 // float variant of gamma-correction
-// We use tables of different size and precision, along with a 'real-world'
-// Gamma value close to ~2.
-#define kGammaF 2.2
+// We use tables of different size and precision for the Rec709
+// transfer function.
+#define kGammaF (1./0.45)
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
 static float kLinearToGammaTabF[kGammaTabSize + 2];
 static volatile int kGammaTablesFOk = 0;
@@ -183,11 +183,26 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
     int v;
     const double norm = 1. / MAX_Y_T;
     const double scale = 1. / kGammaTabSize;
+    const double a = 0.099;
+    const double thresh = 0.018;
     for (v = 0; v <= MAX_Y_T; ++v) {
-      kGammaToLinearTabF[v] = (float)pow(norm * v, kGammaF);
+      const double g = norm * v;
+      if (g <= thresh * 4.5) {
+        kGammaToLinearTabF[v] = (float)(g / 4.5);
+      } else {
+        const double a_rec = 1. / (1. + a);
+        kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
+      }
     }
     for (v = 0; v <= kGammaTabSize; ++v) {
-      kLinearToGammaTabF[v] = (float)(MAX_Y_T * pow(scale * v, 1. / kGammaF));
+      const double g = scale * v;
+      double value;
+      if (g <= thresh) {
+        value = 4.5 * g;
+      } else {
+        value = (1. + a) * pow(g, 1. / kGammaF) - a;
+      }
+      kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
     }
     // to prevent small rounding errors to cause read-overflow:
     kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
@@ -235,12 +250,12 @@ static fixed_y_t clip_y(int y) {
 //------------------------------------------------------------------------------
 
 static int RGBToGray(int r, int g, int b) {
-  const int luma = 19595 * r + 38470 * g + 7471 * b + YUV_HALF;
+  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
   return (luma >> YUV_FIX);
 }
 
 static float RGBToGrayF(float r, float g, float b) {
-  return 0.299f * r + 0.587f * g + 0.114f * b;
+  return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
 }
 
 static int ScaleDown(int a, int b, int c, int d) {
@@ -251,58 +266,50 @@ static int ScaleDown(int a, int b, int c, int d) {
   return LinearToGammaF(0.25f * (A + B + C + D));
 }
 
-static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
-  while (len-- > 0) {
-    const float R = GammaToLinearF(src[0]);
-    const float G = GammaToLinearF(src[1]);
-    const float B = GammaToLinearF(src[2]);
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    const float R = GammaToLinearF(src[0 * w + i]);
+    const float G = GammaToLinearF(src[1 * w + i]);
+    const float B = GammaToLinearF(src[2 * w + i]);
     const float Y = RGBToGrayF(R, G, B);
-    *dst++ = (fixed_y_t)LinearToGammaF(Y);
-    src += 3;
+    dst[i] = (fixed_y_t)LinearToGammaF(Y);
   }
 }
 
-static int UpdateChroma(const fixed_y_t* src1,
-                        const fixed_y_t* src2,
-                        fixed_t* dst, fixed_y_t* tmp, int len) {
-  int diff = 0;
-  while (len--> 0) {
-    const int r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
-    const int g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
-    const int b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w) {
+  int i;
+  for (i = 0; i < uv_w; ++i) {
+    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
     const int W = RGBToGray(r, g, b);
-    const int r_avg = (src1[0] + src1[3] + src2[0] + src2[3] + 2) >> 2;
-    const int g_avg = (src1[1] + src1[4] + src2[1] + src2[4] + 2) >> 2;
-    const int b_avg = (src1[2] + src1[5] + src2[2] + src2[5] + 2) >> 2;
-    dst[0] = (fixed_t)(r - W);
-    dst[1] = (fixed_t)(g - W);
-    dst[2] = (fixed_t)(b - W);
-    dst += 3;
-    src1 += 6;
-    src2 += 6;
-    if (tmp != NULL) {
-      tmp[0] = tmp[1] = clip_y(W);
-      tmp += 2;
-    }
-    diff += abs(RGBToGray(r_avg, g_avg, b_avg) - W);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
   }
-  return diff;
 }
 
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
-                              int rightwise) {
-  int v;
-  if (!rightwise) {
-    v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
-  } else {
-    v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
   }
-  return (v + 8) >> 4;
 }
 
-static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_y(v0 + W0);
+}
 
 //------------------------------------------------------------------------------
 
@@ -317,52 +324,50 @@ static void ImportOneRow(const uint8_t* const r_ptr,
                          int pic_width,
                          fixed_y_t* const dst) {
   int i;
+  const int w = (pic_width + 1) & ~1;
   for (i = 0; i < pic_width; ++i) {
     const int off = i * step;
-    dst[3 * i + 0] = UpLift(r_ptr[off]);
-    dst[3 * i + 1] = UpLift(g_ptr[off]);
-    dst[3 * i + 2] = UpLift(b_ptr[off]);
+    dst[i + 0 * w] = UpLift(r_ptr[off]);
+    dst[i + 1 * w] = UpLift(g_ptr[off]);
+    dst[i + 2 * w] = UpLift(b_ptr[off]);
   }
   if (pic_width & 1) {  // replicate rightmost pixel
-    memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
   }
 }
 
 static void InterpolateTwoRows(const fixed_y_t* const best_y,
-                               const fixed_t* const prev_uv,
-                               const fixed_t* const cur_uv,
-                               const fixed_t* const next_uv,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
                                int w,
-                               fixed_y_t* const out1,
-                               fixed_y_t* const out2) {
-  int i, k;
-  {  // special boundary case for i==0
-    const int W0 = best_y[0];
-    const int W1 = best_y[w];
-    for (k = 0; k <= 2; ++k) {
-      out1[k] = clip_y(Filter2(cur_uv[k], prev_uv[k]) + W0);
-      out2[k] = clip_y(Filter2(cur_uv[k], next_uv[k]) + W1);
-    }
-  }
-  for (i = 1; i < w - 1; ++i) {
-    const int W0 = best_y[i + 0];
-    const int W1 = best_y[i + w];
-    const int off = 3 * (i >> 1);
-    for (k = 0; k <= 2; ++k) {
-      const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
-      const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
-      out1[3 * i + k] = clip_y(tmp0 + W0);
-      out2[3 * i + k] = clip_y(tmp1 + W1);
-    }
-  }
-  {  // special boundary case for i == w - 1
-    const int W0 = best_y[i + 0];
-    const int W1 = best_y[i + w];
-    const int off = 3 * (i >> 1);
-    for (k = 0; k <= 2; ++k) {
-      out1[3 * i + k] = clip_y(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
-      out2[3 * i + k] = clip_y(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+                               fixed_y_t* out1,
+                               fixed_y_t* out2) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  while (k-- > 0) {   // process each of the R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+    WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+    WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0]);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w]);
     }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
   }
 }
 
@@ -381,36 +386,42 @@ static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
   return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
 }
 
-static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
-                            const fixed_t* const best_uv,
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
                             WebPPicture* const picture) {
   int i, j;
+  uint8_t* dst_y = picture->y;
+  uint8_t* dst_u = picture->u;
+  uint8_t* dst_v = picture->v;
+  const fixed_t* const best_uv_base = best_uv;
   const int w = (picture->width + 1) & ~1;
   const int h = (picture->height + 1) & ~1;
   const int uv_w = w >> 1;
   const int uv_h = h >> 1;
-  for (j = 0; j < picture->height; ++j) {
+  for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
     for (i = 0; i < picture->width; ++i) {
-      const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
-      const int off2 = i + j * picture->y_stride;
-      const int W = best_y[i + j * w];
-      const int r = best_uv[off + 0] + W;
-      const int g = best_uv[off + 1] + W;
-      const int b = best_uv[off + 2] + W;
-      picture->y[off2] = ConvertRGBToY(r, g, b);
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      dst_y[i] = ConvertRGBToY(r, g, b);
     }
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    dst_y += picture->y_stride;
   }
-  for (j = 0; j < uv_h; ++j) {
-    uint8_t* const dst_u = picture->u + j * picture->uv_stride;
-    uint8_t* const dst_v = picture->v + j * picture->uv_stride;
+  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
     for (i = 0; i < uv_w; ++i) {
-      const int off = 3 * (i + j * uv_w);
-      const int r = best_uv[off + 0];
-      const int g = best_uv[off + 1];
-      const int b = best_uv[off + 2];
+      const int off = i;
+      const int r = best_uv[off + 0 * uv_w];
+      const int g = best_uv[off + 1 * uv_w];
+      const int b = best_uv[off + 2 * uv_w];
       dst_u[i] = ConvertRGBToU(r, g, b);
       dst_v[i] = ConvertRGBToV(r, g, b);
     }
+    best_uv += 3 * uv_w;
+    dst_u += picture->uv_stride;
+    dst_v += picture->uv_stride;
   }
   return 1;
 }
@@ -420,9 +431,9 @@ static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
 
 #define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
 
-static int PreprocessARGB(const uint8_t* const r_ptr,
-                          const uint8_t* const g_ptr,
-                          const uint8_t* const b_ptr,
+static int PreprocessARGB(const uint8_t* r_ptr,
+                          const uint8_t* g_ptr,
+                          const uint8_t* b_ptr,
                           int step, int rgb_stride,
                           WebPPicture* const picture) {
   // we expand the right/bottom border if needed
@@ -430,25 +441,27 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
   const int h = (picture->height + 1) & ~1;
   const int uv_w = w >> 1;
   const int uv_h = h >> 1;
-  int i, j, iter;
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
 
   // TODO(skal): allocate one big memory chunk. But for now, it's easier
   // for valgrind debugging to have several chunks.
   fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
-  fixed_y_t* const best_y = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const target_y = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
   fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
-  fixed_t* const best_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
   fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
   int ok;
-  int diff_sum = 0;
-  const int first_diff_threshold = (int)(2.5 * w * h);
-  const int min_improvement = 5;   // stop if improvement is below this %
-  const int min_first_improvement = 80;
 
-  if (best_y == NULL || best_uv == NULL ||
-      target_y == NULL || target_uv == NULL ||
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
       best_rgb_y == NULL || best_rgb_uv == NULL ||
       tmp_buffer == NULL) {
     ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
@@ -457,101 +470,86 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
   assert(picture->width >= kMinDimensionIterativeConversion);
   assert(picture->height >= kMinDimensionIterativeConversion);
 
+  WebPInitConvertARGBToYUV();
+
   // Import RGB samples to W/RGB representation.
   for (j = 0; j < picture->height; j += 2) {
     const int is_last_row = (j == picture->height - 1);
-    fixed_y_t* const src1 = tmp_buffer;
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
     fixed_y_t* const src2 = tmp_buffer + 3 * w;
-    const int off1 = j * rgb_stride;
-    const int off2 = off1 + rgb_stride;
-    const int uv_off = (j >> 1) * 3 * uv_w;
-    fixed_y_t* const dst_y = best_y + j * w;
 
     // prepare two rows of input
-    ImportOneRow(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                 step, picture->width, src1);
+    ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
     if (!is_last_row) {
-      ImportOneRow(r_ptr + off2, g_ptr + off2, b_ptr + off2,
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
                    step, picture->width, src2);
     } else {
       memcpy(src2, src1, 3 * w * sizeof(*src2));
     }
-    UpdateW(src1, target_y + (j + 0) * w, w);
-    UpdateW(src2, target_y + (j + 1) * w, w);
-    diff_sum += UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
-    memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
-    memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w);
+    UpdateW(src2, target_y + w, w);
+    UpdateChroma(src1, src2, target_uv, uv_w);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
   }
 
   // Iterate and resolve clipping conflicts.
   for (iter = 0; iter < kNumIterations; ++iter) {
-    int k;
-    const fixed_t* cur_uv = best_uv;
-    const fixed_t* prev_uv = best_uv;
-    const int old_diff_sum = diff_sum;
-    diff_sum = 0;
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
     for (j = 0; j < h; j += 2) {
-      fixed_y_t* const src1 = tmp_buffer;
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
       fixed_y_t* const src2 = tmp_buffer + 3 * w;
       {
         const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
-        InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
-                           w, src1, src2);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
         prev_uv = cur_uv;
         cur_uv = next_uv;
       }
 
       UpdateW(src1, best_rgb_y + 0 * w, w);
       UpdateW(src2, best_rgb_y + 1 * w, w);
-      diff_sum += UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
 
       // update two rows of Y and one row of RGB
-      for (i = 0; i < 2 * w; ++i) {
-        const int off = i + j * w;
-        const int diff_y = target_y[off] - best_rgb_y[i];
-        const int new_y = (int)best_y[off] + diff_y;
-        best_y[off] = clip_y(new_y);
-      }
-      for (i = 0; i < uv_w; ++i) {
-        const int off = 3 * (i + (j >> 1) * uv_w);
-        int W;
-        for (k = 0; k <= 2; ++k) {
-          const int diff_uv = (int)target_uv[off + k] - best_rgb_uv[3 * i + k];
-          best_uv[off + k] += diff_uv;
-        }
-        W = RGBToGray(best_uv[off + 0], best_uv[off + 1], best_uv[off + 2]);
-        for (k = 0; k <= 2; ++k) {
-          best_uv[off + k] -= W;
-        }
-      }
+      diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+      WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
     }
     // test exit condition
-    if (diff_sum > 0) {
-      const int improvement = 100 * abs(diff_sum - old_diff_sum) / diff_sum;
-      // Check if first iteration gave good result already, without a large
-      // jump of improvement (otherwise it means we need to try few extra
-      // iterations, just to be sure).
-      if (iter == 0 && diff_sum < first_diff_threshold &&
-          improvement < min_first_improvement) {
-        break;
-      }
-      // then, check if improvement is stalling.
-      if (improvement < min_improvement) {
-        break;
-      }
-    } else {
-      break;
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
     }
+    prev_diff_y_sum = diff_y_sum;
   }
-
   // final reconstruction
-  ok = ConvertWRGBToYUV(best_y, best_uv, picture);
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
 
  End:
-  WebPSafeFree(best_y);
-  WebPSafeFree(best_uv);
-  WebPSafeFree(target_y);
-  WebPSafeFree(target_uv);
+  WebPSafeFree(best_y_base);
+  WebPSafeFree(best_uv_base);
+  WebPSafeFree(target_y_base);
+  WebPSafeFree(target_uv_base);
   WebPSafeFree(best_rgb_y);
   WebPSafeFree(best_rgb_uv);
   WebPSafeFree(tmp_buffer);
@@ -830,10 +828,10 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
   }
 }
 
-static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
-                              const uint8_t* const g_ptr,
-                              const uint8_t* const b_ptr,
-                              const uint8_t* const a_ptr,
+static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
+                              const uint8_t* g_ptr,
+                              const uint8_t* b_ptr,
+                              const uint8_t* a_ptr,
                               int step,         // bytes per pixel
                               int rgb_stride,   // bytes per scanline
                               float dithering,
@@ -900,36 +898,34 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     // Downsample Y/U/V planes, two rows at a time
     for (y = 0; y < (height >> 1); ++y) {
       int rows_have_alpha = has_alpha;
-      const int off1 = (2 * y + 0) * rgb_stride;
-      const int off2 = (2 * y + 1) * rgb_stride;
       if (use_dsp) {
         if (is_rgb) {
-          WebPConvertRGB24ToY(r_ptr + off1, dst_y, width);
-          WebPConvertRGB24ToY(r_ptr + off2, dst_y + picture->y_stride, width);
+          WebPConvertRGB24ToY(r_ptr, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr + rgb_stride,
+                              dst_y + picture->y_stride, width);
         } else {
-          WebPConvertBGR24ToY(b_ptr + off1, dst_y, width);
-          WebPConvertBGR24ToY(b_ptr + off2, dst_y + picture->y_stride, width);
+          WebPConvertBGR24ToY(b_ptr, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr + rgb_stride,
+                              dst_y + picture->y_stride, width);
         }
       } else {
-        ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
-                      dst_y, width, rg);
-        ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+        ConvertRowToY(r_ptr + rgb_stride,
+                      g_ptr + rgb_stride,
+                      b_ptr + rgb_stride, step,
                       dst_y + picture->y_stride, width, rg);
       }
       dst_y += 2 * picture->y_stride;
       if (has_alpha) {
-        rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
-                                             width, 2,
+        rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
                                              dst_a, picture->a_stride);
         dst_a += 2 * picture->a_stride;
       }
       // Collect averaged R/G/B(/A)
       if (!rows_have_alpha) {
-        AccumulateRGB(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                      step, rgb_stride, tmp_rgb, width);
+        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
       } else {
-        AccumulateRGBA(r_ptr + off1, g_ptr + off1, b_ptr + off1, a_ptr + off1,
-                       rgb_stride, tmp_rgb, width);
+        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
       }
       // Convert to U/V
       if (rg == NULL) {
@@ -939,31 +935,33 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
       }
       dst_u += picture->uv_stride;
       dst_v += picture->uv_stride;
+      r_ptr += 2 * rgb_stride;
+      b_ptr += 2 * rgb_stride;
+      g_ptr += 2 * rgb_stride;
+      if (has_alpha) a_ptr += 2 * rgb_stride;
     }
     if (height & 1) {    // extra last row
-      const int off = 2 * y * rgb_stride;
       int row_has_alpha = has_alpha;
       if (use_dsp) {
         if (r_ptr < b_ptr) {
-          WebPConvertRGB24ToY(r_ptr + off, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr, dst_y, width);
         } else {
-          WebPConvertBGR24ToY(b_ptr + off, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr, dst_y, width);
         }
       } else {
-        ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
-                      dst_y, width, rg);
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
       }
       if (row_has_alpha) {
-        row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
+        row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
       }
       // Collect averaged R/G/B(/A)
       if (!row_has_alpha) {
         // Collect averaged R/G/B
-        AccumulateRGB(r_ptr + off, g_ptr + off, b_ptr + off,
-                      step, /* rgb_stride = */ 0, tmp_rgb, width);
+        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0,
+                      tmp_rgb, width);
       } else {
-        AccumulateRGBA(r_ptr + off, g_ptr + off, b_ptr + off, a_ptr + off,
-                       /* rgb_stride = */ 0, tmp_rgb, width);
+        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
+                       tmp_rgb, width);
       }
       if (rg == NULL) {
         WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
@@ -1013,9 +1011,13 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
 }
 
-int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+int WebPPictureSharpARGBToYUVA(WebPPicture* picture) {
   return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
 }
+// for backward compatibility
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+  return WebPPictureSharpARGBToYUVA(picture);
+}
 
 //------------------------------------------------------------------------------
 // call for YUVA -> ARGB conversion
@@ -1086,10 +1088,10 @@ static int Import(WebPPicture* const picture,
                   const uint8_t* const rgb, int rgb_stride,
                   int step, int swap_rb, int import_alpha) {
   int y;
-  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
-  const uint8_t* const g_ptr = rgb + 1;
-  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
-  const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
+  const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
+  const uint8_t* g_ptr = rgb + 1;
+  const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
+  const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
   const int width = picture->width;
   const int height = picture->height;
 
@@ -1102,20 +1104,25 @@ static int Import(WebPPicture* const picture,
   VP8EncDspARGBInit();
 
   if (import_alpha) {
+    uint32_t* dst = picture->argb;
     assert(step == 4);
     for (y = 0; y < height; ++y) {
-      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
-      const int offset = y * rgb_stride;
-      VP8PackARGB(a_ptr + offset, r_ptr + offset, g_ptr + offset,
-                  b_ptr + offset, width, dst);
+      VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+      a_ptr += rgb_stride;
+      r_ptr += rgb_stride;
+      g_ptr += rgb_stride;
+      b_ptr += rgb_stride;
+      dst += picture->argb_stride;
     }
   } else {
+    uint32_t* dst = picture->argb;
     assert(step >= 3);
     for (y = 0; y < height; ++y) {
-      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
-      const int offset = y * rgb_stride;
-      VP8PackRGB(r_ptr + offset, g_ptr + offset, b_ptr + offset,
-                 width, step, dst);
+      VP8PackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+      r_ptr += rgb_stride;
+      g_ptr += rgb_stride;
+      b_ptr += rgb_stride;
+      dst += picture->argb_stride;
     }
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture_enc.c
index d9befbc..dfa6651 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture_enc.c
@@ -14,7 +14,7 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
 #include "../utils/utils.h"
 
@@ -88,8 +88,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
 }
 
 int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
-  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
-  const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const WebPEncCSP uv_csp =
+      (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+  const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
   const int y_stride = width;
   const int uv_width = (width + 1) >> 1;
   const int uv_height = (height + 1) >> 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
deleted file mode 100644
index 81ab1b5..0000000
--- a/src/3rdparty/libwebp/src/enc/picture_psnr.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// WebPPicture tools for measuring distortion
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <math.h>
-#include <stdlib.h>
-
-#include "./vp8enci.h"
-#include "../utils/utils.h"
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-#define RADIUS 2  // search radius. Shouldn't be too large.
-
-static void AccumulateLSIM(const uint8_t* src, int src_stride,
-                           const uint8_t* ref, int ref_stride,
-                           int w, int h, VP8DistoStats* stats) {
-  int x, y;
-  double total_sse = 0.;
-  for (y = 0; y < h; ++y) {
-    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
-    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
-    for (x = 0; x < w; ++x) {
-      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
-      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
-      double best_sse = 255. * 255.;
-      const double value = (double)ref[y * ref_stride + x];
-      int i, j;
-      for (j = y_0; j < y_1; ++j) {
-        const uint8_t* const s = src + j * src_stride;
-        for (i = x_0; i < x_1; ++i) {
-          const double diff = s[i] - value;
-          const double sse = diff * diff;
-          if (sse < best_sse) best_sse = sse;
-        }
-      }
-      total_sse += best_sse;
-    }
-  }
-  stats->w = w * h;
-  stats->xm = 0;
-  stats->ym = 0;
-  stats->xxm = total_sse;
-  stats->yym = 0;
-  stats->xxm = 0;
-}
-#undef RADIUS
-
-//------------------------------------------------------------------------------
-// Distortion
-
-// Max value returned in case of exact similarity.
-static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
-  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                          : kMinDistortion_dB);
-}
-
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
-                          int type, float result[5]) {
-  VP8DistoStats stats[5];
-  int w, h;
-
-  memset(stats, 0, sizeof(stats));
-
-  VP8SSIMDspInit();
-
-  if (src == NULL || ref == NULL ||
-      src->width != ref->width || src->height != ref->height ||
-      src->use_argb != ref->use_argb || result == NULL) {
-    return 0;
-  }
-  w = src->width;
-  h = src->height;
-
-  if (src->use_argb == 1) {
-    if (src->argb == NULL || ref->argb == NULL) {
-      return 0;
-    } else {
-      int i, j, c;
-      uint8_t* tmp1, *tmp2;
-      uint8_t* const tmp_plane =
-          (uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane));
-      if (tmp_plane == NULL) return 0;
-      tmp1 = tmp_plane;
-      tmp2 = tmp_plane + w * h;
-      for (c = 0; c < 4; ++c) {
-        for (j = 0; j < h; ++j) {
-          for (i = 0; i < w; ++i) {
-            tmp1[j * w + i] = src->argb[i + j * src->argb_stride] >> (c * 8);
-            tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8);
-          }
-        }
-        if (type >= 2) {
-          AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]);
-        } else {
-          VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
-        }
-      }
-      free(tmp_plane);
-    }
-  } else {
-    int has_alpha, uv_w, uv_h;
-    if (src->y == NULL || ref->y == NULL ||
-        src->u == NULL || ref->u == NULL ||
-        src->v == NULL || ref->v == NULL) {
-      return 0;
-    }
-    has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-    if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-        (has_alpha && (src->a == NULL || ref->a == NULL))) {
-      return 0;
-    }
-
-    uv_w = (src->width + 1) >> 1;
-    uv_h = (src->height + 1) >> 1;
-    if (type >= 2) {
-      AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
-                     w, h, &stats[0]);
-      AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
-                     uv_w, uv_h, &stats[1]);
-      AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
-                     uv_w, uv_h, &stats[2]);
-      if (has_alpha) {
-        AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
-                       w, h, &stats[3]);
-      }
-    } else {
-      VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                             ref->y, ref->y_stride,
-                             w, h, &stats[0]);
-      VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                             ref->u, ref->uv_stride,
-                             uv_w, uv_h, &stats[1]);
-      VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                             ref->v, ref->uv_stride,
-                             uv_w, uv_h, &stats[2]);
-      if (has_alpha) {
-        VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                               ref->a, ref->a_stride,
-                               w, h, &stats[3]);
-      }
-    }
-  }
-  // Final stat calculations.
-  {
-    int c;
-    for (c = 0; c <= 4; ++c) {
-      if (type == 1) {
-        const double v = VP8SSIMGet(&stats[c]);
-        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
-                                     : kMinDistortion_dB);
-      } else {
-        const double v = VP8SSIMGetSquaredError(&stats[c]);
-        result[c] = GetPSNR(v);
-      }
-      // Accumulate forward
-      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
-    }
-  }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
new file mode 100644
index 0000000..9c0b229
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -0,0 +1,213 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "./vp8i_enc.h"
+#include "../utils/utils.h"
+
+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+                                 const uint8_t* ref, int ref_stride,
+                                 int w, int h);
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2  // search radius. Shouldn't be too large.
+
+static double AccumulateLSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  int x, y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+    for (x = 0; x < w; ++x) {
+      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+      double best_sse = 255. * 255.;
+      const double value = (double)ref[y * ref_stride + x];
+      int i, j;
+      for (j = y_0; j < y_1; ++j) {
+        const uint8_t* const s = src + j * src_stride;
+        for (i = x_0; i < x_1; ++i) {
+          const double diff = s[i] - value;
+          const double sse = diff * diff;
+          if (sse < best_sse) best_sse = sse;
+        }
+      }
+      total_sse += best_sse;
+    }
+  }
+  return total_sse;
+}
+#undef RADIUS
+
+static double AccumulateSSE(const uint8_t* src, int src_stride,
+                            const uint8_t* ref, int ref_stride,
+                            int w, int h) {
+  int y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    total_sse += VP8AccumulateSSE(src, ref, w);
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return total_sse;
+}
+
+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+  const int w1 = w - VP8_SSIM_KERNEL - 1;
+  const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+  const int h1 = h - VP8_SSIM_KERNEL - 1;
+  int x, y;
+  double sum = 0.;
+  for (y = 0; y < h0; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h1; ++y) {
+    for (x = 0; x < w0; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+    for (; x < w1; ++x) {
+      const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+      const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+      sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+    }
+    for (; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  return sum;
+}
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+static double GetPSNR(double v, double size) {
+  return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
+                               : kMinDistortion_dB;
+}
+
+static double GetLogSSIM(double v, double size) {
+  v = (size > 0.) ? v / size : 1.;
+  return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
+}
+
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                        const uint8_t* ref, size_t ref_stride,
+                        int width, int height, size_t x_step,
+                        int type, float* distortion, float* result) {
+  uint8_t* allocated = NULL;
+  const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+                                (type == 1) ? AccumulateSSIM :
+                                              AccumulateLSIM;
+  if (src == NULL || ref == NULL ||
+      src_stride < x_step * width || ref_stride < x_step * width ||
+      result == NULL || distortion == NULL) {
+    return 0;
+  }
+
+  VP8SSIMDspInit();
+  if (x_step != 1) {   // extract a packed plane if needed
+    int x, y;
+    uint8_t* tmp1;
+    uint8_t* tmp2;
+    allocated =
+        (uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
+    if (allocated == NULL) return 0;
+    tmp1 = allocated;
+    tmp2 = tmp1 + (size_t)width * height;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        tmp1[x + y * width] = src[x * x_step + y * src_stride];
+        tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
+      }
+    }
+    src = tmp1;
+    ref = tmp2;
+  }
+  *distortion = (float)metric(src, width, ref, width, width, height);
+  WebPSafeFree(allocated);
+
+  *result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
+                        : (float)GetPSNR(*distortion, (double)width * height);
+  return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float results[5]) {
+  int w, h, c;
+  int ok = 0;
+  WebPPicture p0, p1;
+  double total_size = 0., total_distortion = 0.;
+  if (src == NULL || ref == NULL ||
+      src->width != ref->width || src->height != ref->height ||
+      results == NULL) {
+    return 0;
+  }
+
+  VP8SSIMDspInit();
+  if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
+  w = src->width;
+  h = src->height;
+  if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
+  if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
+
+  // We always measure distortion in ARGB space.
+  if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
+  if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
+  for (c = 0; c < 4; ++c) {
+    float distortion;
+    const size_t stride0 = 4 * (size_t)p0.argb_stride;
+    const size_t stride1 = 4 * (size_t)p1.argb_stride;
+    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
+                             (const uint8_t*)p1.argb + c, stride1,
+                             w, h, 4, type, &distortion, results + c)) {
+      goto Error;
+    }
+    total_distortion += distortion;
+    total_size += w * h;
+  }
+
+  results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
+                           : (float)GetPSNR(total_distortion, total_size);
+  ok = 1;
+
+ Error:
+  WebPPictureFree(&p0);
+  WebPPictureFree(&p1);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
index 9f19e8e..0b7181c 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8enci.h"
-#include "../utils/rescaler.h"
+#include "./vp8i_enc.h"
+#include "../utils/rescaler_utils.h"
 #include "../utils/utils.h"
 
 #define HALVE(x) (((x) + 1) >> 1)
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
index bf97af8..895df51 100644
--- a/src/3rdparty/libwebp/src/enc/picture_tools.c
+++ b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
@@ -13,7 +13,7 @@
 
 #include <assert.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/yuv.h"
 
 static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
diff --git a/src/3rdparty/libwebp/src/enc/predictor_enc.c b/src/3rdparty/libwebp/src/enc/predictor_enc.c
new file mode 100644
index 0000000..0639b74
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/predictor_enc.c
@@ -0,0 +1,750 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+//          Vincent Rabaud (vrabaud@google.com)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "./vp8li_enc.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const float kSpatialPredictorBias = 15.f;
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// Mostly used to reduce code size and improve readability.
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+                                   double exp_val) {
+  const int significant_symbols = 256 >> 4;
+  const double exp_decay_factor = 0.6;
+  double bits = weight_0 * counts[0];
+  int i;
+  for (i = 1; i < significant_symbols; ++i) {
+    bits += exp_val * (counts[i] + counts[256 - i]);
+    exp_val *= exp_decay_factor;
+  }
+  return (float)(-0.1 * bits);
+}
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+                                            const int tile[4][256]) {
+  int i;
+  double retval = 0;
+  for (i = 0; i < 4; ++i) {
+    const double kExpValue = 0.94;
+    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+  }
+  return (float)retval;
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+  ++histo_argb[0][argb >> 24];
+  ++histo_argb[1][(argb >> 16) & 0xff];
+  ++histo_argb[2][(argb >> 8) & 0xff];
+  ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+// Spatial transform functions.
+
+static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
+                                     int num_pixels, const uint32_t* current,
+                                     const uint32_t* upper, uint32_t* out) {
+  if (x_start == 0) {
+    if (y == 0) {
+      // ARGB_BLACK.
+      VP8LPredictorsSub[0](current, NULL, 1, out);
+    } else {
+      // Top one.
+      VP8LPredictorsSub[2](current, upper, 1, out);
+    }
+    ++x_start;
+    ++out;
+    --num_pixels;
+  }
+  if (y == 0) {
+    // Left one.
+    VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
+  } else {
+    VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
+                            out);
+  }
+}
+
+static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
+  const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
+  const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
+  const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
+  const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
+  return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
+}
+
+static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
+                              uint32_t left, uint32_t right) {
+  const int diff_up = MaxDiffBetweenPixels(current, up);
+  const int diff_down = MaxDiffBetweenPixels(current, down);
+  const int diff_left = MaxDiffBetweenPixels(current, left);
+  const int diff_right = MaxDiffBetweenPixels(current, right);
+  return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
+}
+
+static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
+  const uint32_t green = (argb >> 8) & 0xff;
+  uint32_t red_blue = argb & 0x00ff00ffu;
+  red_blue += (green << 16) | green;
+  red_blue &= 0x00ff00ffu;
+  return (argb & 0xff00ff00u) | red_blue;
+}
+
+static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
+                           uint8_t* const max_diffs, int used_subtract_green) {
+  uint32_t current, up, down, left, right;
+  int x;
+  if (width <= 2) return;
+  current = argb[0];
+  right = argb[1];
+  if (used_subtract_green) {
+    current = AddGreenToBlueAndRed(current);
+    right = AddGreenToBlueAndRed(right);
+  }
+  // max_diffs[0] and max_diffs[width - 1] are never used.
+  for (x = 1; x < width - 1; ++x) {
+    up = argb[-stride + x];
+    down = argb[stride + x];
+    left = current;
+    current = right;
+    right = argb[x + 1];
+    if (used_subtract_green) {
+      up = AddGreenToBlueAndRed(up);
+      down = AddGreenToBlueAndRed(down);
+      right = AddGreenToBlueAndRed(right);
+    }
+    max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
+  }
+}
+
+// Quantize the difference between the actual component value and its prediction
+// to a multiple of quantization, working modulo 256, taking care not to cross
+// a boundary (inclusive upper limit).
+static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
+                                     uint8_t boundary, int quantization) {
+  const int residual = (value - predict) & 0xff;
+  const int boundary_residual = (boundary - predict) & 0xff;
+  const int lower = residual & ~(quantization - 1);
+  const int upper = lower + quantization;
+  // Resolve ties towards a value closer to the prediction (i.e. towards lower
+  // if value comes after prediction and towards upper otherwise).
+  const int bias = ((boundary - value) & 0xff) < boundary_residual;
+  if (residual - lower < upper - residual + bias) {
+    // lower is closer to residual than upper.
+    if (residual > boundary_residual && lower <= boundary_residual) {
+      // Halve quantization step to avoid crossing boundary. This midpoint is
+      // on the same side of boundary as residual because midpoint >= residual
+      // (since lower is closer than upper) and residual is above the boundary.
+      return lower + (quantization >> 1);
+    }
+    return lower;
+  } else {
+    // upper is closer to residual than lower.
+    if (residual <= boundary_residual && upper > boundary_residual) {
+      // Halve quantization step to avoid crossing boundary. This midpoint is
+      // on the same side of boundary as residual because midpoint <= residual
+      // (since upper is closer than lower) and residual is below the boundary.
+      return lower + (quantization >> 1);
+    }
+    return upper & 0xff;
+  }
+}
+
+// Quantize every component of the difference between the actual pixel value and
+// its prediction to a multiple of a quantization (a power of 2, not larger than
+// max_quantization which is a power of 2, smaller than max_diff). Take care if
+// value and predict have undergone subtract green, which means that red and
+// blue are represented as offsets from green.
+static uint32_t NearLossless(uint32_t value, uint32_t predict,
+                             int max_quantization, int max_diff,
+                             int used_subtract_green) {
+  int quantization;
+  uint8_t new_green = 0;
+  uint8_t green_diff = 0;
+  uint8_t a, r, g, b;
+  if (max_diff <= 2) {
+    return VP8LSubPixels(value, predict);
+  }
+  quantization = max_quantization;
+  while (quantization >= max_diff) {
+    quantization >>= 1;
+  }
+  if ((value >> 24) == 0 || (value >> 24) == 0xff) {
+    // Preserve transparency of fully transparent or fully opaque pixels.
+    a = ((value >> 24) - (predict >> 24)) & 0xff;
+  } else {
+    a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
+  }
+  g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
+                            quantization);
+  if (used_subtract_green) {
+    // The green offset will be added to red and blue components during decoding
+    // to obtain the actual red and blue values.
+    new_green = ((predict >> 8) + g) & 0xff;
+    // The amount by which green has been adjusted during quantization. It is
+    // subtracted from red and blue for compensation, to avoid accumulating two
+    // quantization errors in them.
+    green_diff = (new_green - (value >> 8)) & 0xff;
+  }
+  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
+                            (predict >> 16) & 0xff, 0xff - new_green,
+                            quantization);
+  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
+                            0xff - new_green, quantization);
+  return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}
+
+// Stores the difference between the pixel and its prediction in "out".
+// In case of a lossy encoding, updates the source image to avoid propagating
+// the deviation further to pixels which depend on the current pixel for their
+// predictions.
+static WEBP_INLINE void GetResidual(
+    int width, int height, uint32_t* const upper_row,
+    uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
+    int x_start, int x_end, int y, int max_quantization, int exact,
+    int used_subtract_green, uint32_t* const out) {
+  if (exact) {
+    PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
+                 out);
+  } else {
+    const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+    int x;
+    for (x = x_start; x < x_end; ++x) {
+      uint32_t predict;
+      uint32_t residual;
+      if (y == 0) {
+        predict = (x == 0) ? ARGB_BLACK : current_row[x - 1];  // Left.
+      } else if (x == 0) {
+        predict = upper_row[x];  // Top.
+      } else {
+        predict = pred_func(current_row[x - 1], upper_row + x);
+      }
+      if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
+          x == 0 || x == width - 1) {
+        residual = VP8LSubPixels(current_row[x], predict);
+      } else {
+        residual = NearLossless(current_row[x], predict, max_quantization,
+                                max_diffs[x], used_subtract_green);
+        // Update the source image.
+        current_row[x] = VP8LAddPixels(predict, residual);
+        // x is never 0 here so we do not need to update upper_row like below.
+      }
+      if ((current_row[x] & kMaskAlpha) == 0) {
+        // If alpha is 0, cleanup RGB. We can choose the RGB values of the
+        // residual for best compression. The prediction of alpha itself can be
+        // non-zero and must be kept though. We choose RGB of the residual to be
+        // 0.
+        residual &= kMaskAlpha;
+        // Update the source image.
+        current_row[x] = predict & ~kMaskAlpha;
+        // The prediction for the rightmost pixel in a row uses the leftmost
+        // pixel in that row as its top-right context pixel. Hence, if we
+        // change the leftmost pixel of current_row, the corresponding change
+        // must also be applied to upper_row, which is where the top-right
+        // context is read from: upper_row[width] must mirror the updated
+        // current_row[0].
+        if (x == 0 && y != 0) upper_row[width] = current_row[0];
+      }
+      out[x - x_start] = residual;
+    }
+  }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+// If max_quantization > 1, assumes that near lossless processing will be
+// applied, quantizing residuals to multiples of quantization levels up to
+// max_quantization (the actual quantization level depends on smoothness near
+// the given pixel).
+static int GetBestPredictorForTile(int width, int height,  // returns the lowest-cost mode for one tile
+                                   int tile_x, int tile_y, int bits,  // tile indices; tile side is 1 << bits
+                                   int accumulated[4][256],  // in/out: winner's histogram is added to it
+                                   uint32_t* const argb_scratch,  // holds two rows plus max_diffs bytes
+                                   const uint32_t* const argb,  // source pixels, only read here
+                                   int max_quantization,
+                                   int exact, int used_subtract_green,
+                                   const uint32_t* const modes) {  // read for left/above tile modes
+  const int kNumPredModes = 14;  // modes 0..13 are each evaluated below
+  const int start_x = tile_x << bits;
+  const int start_y = tile_y << bits;
+  const int tile_size = 1 << bits;
+  const int max_y = GetMin(tile_size, height - start_y);  // tile height clipped to image
+  const int max_x = GetMin(tile_size, width - start_x);  // tile width clipped to image
+  // Whether there exist columns just outside the tile.
+  const int have_left = (start_x > 0);
+  const int have_right = (max_x < width - start_x);
+  // Position and size of the strip covering the tile and adjacent columns if
+  // they exist.
+  const int context_start_x = start_x - have_left;
+  const int context_width = max_x + have_left + have_right;
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  // Prediction modes of the left and above neighbor tiles.
+  const int left_mode = (tile_x > 0) ?
+      (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;  // 0xff: no left tile
+  const int above_mode = (tile_y > 0) ?
+      (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;  // 0xff: no tile above
+  // The width of upper_row and current_row is one pixel larger than image width
+  // to allow the top right pixel to point to the leftmost pixel of the next row
+  // when at the right edge.
+  uint32_t* upper_row = argb_scratch;
+  uint32_t* current_row = upper_row + width + 1;
+  uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);  // bytes placed after both rows
+  float best_diff = MAX_DIFF_COST;
+  int best_mode = 0;
+  int mode;
+  int histo_stack_1[4][256];
+  int histo_stack_2[4][256];
+  // Need pointers to be able to swap arrays.
+  int (*histo_argb)[256] = histo_stack_1;  // histogram of the mode being tried
+  int (*best_histo)[256] = histo_stack_2;  // histogram of the best mode so far
+  int i, j;
+  uint32_t residuals[1 << MAX_TRANSFORM_BITS];  // one tile row of residuals
+  assert(bits <= MAX_TRANSFORM_BITS);
+  assert(max_x <= (1 << MAX_TRANSFORM_BITS));  // a tile row must fit in residuals[]
+
+  for (mode = 0; mode < kNumPredModes; ++mode) {
+    float cur_diff;
+    int relative_y;
+    memset(histo_argb, 0, sizeof(histo_stack_1));  // start each mode from an empty histogram
+    if (start_y > 0) {
+      // Read the row above the tile which will become the first upper_row.
+      // Include a pixel to the left if it exists; include a pixel to the right
+      // in all cases (wrapping to the leftmost pixel of the next row if it does
+      // not exist).
+      memcpy(current_row + context_start_x,
+             argb + (start_y - 1) * width + context_start_x,
+             sizeof(*argb) * (max_x + have_left + 1));
+    }
+    for (relative_y = 0; relative_y < max_y; ++relative_y) {
+      const int y = start_y + relative_y;
+      int relative_x;
+      uint32_t* tmp = upper_row;  // swap rows: the previously read row becomes upper_row
+      upper_row = current_row;
+      current_row = tmp;
+      // Read current_row. Include a pixel to the left if it exists; include a
+      // pixel to the right in all cases except at the bottom right corner of
+      // the image (wrapping to the leftmost pixel of the next row if it does
+      // not exist in the current row).
+      memcpy(current_row + context_start_x,
+             argb + y * width + context_start_x,
+             sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+      if (max_quantization > 1 && y >= 1 && y + 1 < height) {  // skipped on first and last image rows
+        MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
+                       max_diffs + context_start_x, used_subtract_green);
+      }
+
+      GetResidual(width, height, upper_row, current_row, max_diffs, mode,
+                  start_x, start_x + max_x, y, max_quantization, exact,
+                  used_subtract_green, residuals);  // residuals for columns [start_x, start_x + max_x)
+      for (relative_x = 0; relative_x < max_x; ++relative_x) {
+        UpdateHisto(histo_argb, residuals[relative_x]);
+      }
+    }
+    cur_diff = PredictionCostSpatialHistogram(
+        (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+    // Favor keeping the areas locally similar.
+    if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
+    if (mode == above_mode) cur_diff -= kSpatialPredictorBias;  // both biases apply if both match
+
+    if (cur_diff < best_diff) {  // strict '<': on a tie the earlier mode is kept
+      int (*tmp)[256] = histo_argb;  // swap so best_histo keeps this mode's histogram
+      histo_argb = best_histo;
+      best_histo = tmp;
+      best_diff = cur_diff;
+      best_mode = mode;
+    }
+  }
+
+  for (i = 0; i < 4; i++) {  // add the winning histogram to the running totals
+    for (j = 0; j < 256; j++) {
+      accumulated[i][j] += best_histo[i][j];
+    }
+  }
+
+  return best_mode;
+}
+
+// Converts pixels of the image to residuals with respect to predictions.
+// If max_quantization > 1, applies near lossless processing, quantizing
+// residuals to multiples of quantization levels up to max_quantization
+// (the actual quantization level depends on smoothness near the given pixel).
+static void CopyImageWithPrediction(int width, int height,
+                                    int bits, uint32_t* const modes,  // mode read from bits 8..15 of each entry
+                                    uint32_t* const argb_scratch,  // two rows plus two max_diffs rows
+                                    uint32_t* const argb,  // in/out: rows are overwritten with residuals
+                                    int low_effort, int max_quantization,  // low_effort: kPredLowEffort everywhere
+                                    int exact, int used_subtract_green) {
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  // The width of upper_row and current_row is one pixel larger than image width
+  // to allow the top right pixel to point to the leftmost pixel of the next row
+  // when at the right edge.
+  uint32_t* upper_row = argb_scratch;
+  uint32_t* current_row = upper_row + width + 1;
+  uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);  // bytes placed after both rows
+  uint8_t* lower_max_diffs = current_max_diffs + width;
+  int y;
+
+  for (y = 0; y < height; ++y) {
+    int x;
+    uint32_t* const tmp32 = upper_row;  // swap rows: last row's copy becomes upper_row
+    upper_row = current_row;
+    current_row = tmp32;
+    memcpy(current_row, argb + y * width,
+           sizeof(*argb) * (width + (y + 1 < height)));  // row y plus one pixel of row y + 1 if it exists
+
+    if (low_effort) {
+      PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
+                   argb + y * width);  // output goes straight back into argb row y
+    } else {
+      if (max_quantization > 1) {
+        // Compute max_diffs for the lower row now, because that needs the
+        // contents of argb for the current row, which we will overwrite with
+        // residuals before proceeding with the next row.
+        uint8_t* const tmp8 = current_max_diffs;  // swap: last iteration's lower row is now current
+        current_max_diffs = lower_max_diffs;
+        lower_max_diffs = tmp8;
+        if (y + 2 < height) {  // only when row y + 1 is not the last image row
+          MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
+                         used_subtract_green);
+        }
+      }
+      for (x = 0; x < width;) {  // advance one tile-wide span at a time
+        const int mode =
+            (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+        int x_end = x + (1 << bits);
+        if (x_end > width) x_end = width;  // clip the last span to the image width
+        GetResidual(width, height, upper_row, current_row, current_max_diffs,
+                    mode, x, x_end, y, max_quantization, exact,
+                    used_subtract_green, argb + y * width + x);  // residuals written into argb
+        x = x_end;
+      }
+    }
+  }
+}
+
+// Finds the best predictor for each tile, and converts the image to residuals
+// with respect to predictions. If near_lossless_quality < 100, applies
+// near lossless processing, shaving off more bits of residuals for lower
+// qualities.
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+                       uint32_t* const argb, uint32_t* const argb_scratch,  // argb is rewritten in place
+                       uint32_t* const image, int near_lossless_quality,  // image: one entry per tile (out)
+                       int exact, int used_subtract_green) {
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  int tile_y;
+  int histo[4][256];  // running histograms; only initialized and used when !low_effort
+  const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
+  if (low_effort) {
+    int i;
+    for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+      image[i] = ARGB_BLACK | (kPredLowEffort << 8);  // same fixed mode for every tile
+    }
+  } else {
+    memset(histo, 0, sizeof(histo));
+    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+      int tile_x;
+      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+            bits, histo, argb_scratch, argb, max_quantization, exact,
+            used_subtract_green, image);  // image doubles as the 'modes' input for neighbor lookups
+        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);  // mode kept in bits 8..15
+      }
+    }
+  }
+
+  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+                          low_effort, max_quantization, exact,
+                          used_subtract_green);  // runs for both paths, after all modes are set
+}
+
+//------------------------------------------------------------------------------
+// Color transform functions.
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {  // sets all three multipliers of *m to 0
+  m->green_to_red_ = 0;
+  m->green_to_blue_ = 0;
+  m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,  // unpacks the low 3 bytes into *m
+                                               VP8LMultipliers* const m) {
+  m->green_to_red_  = (color_code >>  0) & 0xff;  // bits 0..7
+  m->green_to_blue_ = (color_code >>  8) & 0xff;  // bits 8..15
+  m->red_to_blue_   = (color_code >> 16) & 0xff;  // bits 16..23; bits 24..31 are ignored
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(  // packs *m into one 32-bit code
+    const VP8LMultipliers* const m) {
+  return 0xff000000u |  // top byte is always 0xff
+         ((uint32_t)(m->red_to_blue_) << 16) |
+         ((uint32_t)(m->green_to_blue_) << 8) |
+         m->green_to_red_;  // same byte layout that ColorCodeToMultipliers reads
+}
+
+static float PredictionCostCrossColor(const int accumulated[256],  // running histogram passed by the caller
+                                      const int counts[256]) {  // histogram of the candidate being scored
+  // Favor low entropy, locally and globally.
+  // Favor small absolute values for PredictionCostSpatial
+  static const double kExpValue = 2.4;
+  return VP8LCombinedShannonEntropy(counts, accumulated) +  // result is the sum of the two terms
+         PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+static float GetPredictionCostCrossColorRed(  // cost of one green_to_red candidate; lower is better
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,  // prev_*: neighbor multipliers from the caller
+    const int accumulated_red_histo[256]) {
+  int histo[256] = { 0 };  // zeroed, then handed to the collector below
+  float cur_diff;
+
+  VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+                                green_to_red, histo);
+
+  cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+  if ((uint8_t)green_to_red == prev_x.green_to_red_) {  // candidate is compared after truncation to 8 bits
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_red == 0) {  // a zero multiplier gets the same bonus; all three bonuses can stack
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+static void GetBestGreenToRed(  // searches for a low-cost green_to_red and stores it in *best_tx
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {  // only green_to_red_ is written
+  const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
+  int green_to_red_best = 0;  // the search starts from multiplier 0
+  int iter, offset;
+  float best_diff = GetPredictionCostCrossColorRed(
+      argb, stride, tile_width, tile_height, prev_x, prev_y,
+      green_to_red_best, accumulated_red_histo);
+  for (iter = 0; iter < kMaxIters; ++iter) {
+    // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
+    // one in color computation. Having initial delta here as 1 is sufficient
+    // to explore the range of (-2, 2).
+    const int delta = 32 >> iter;  // step halves every iteration: 32, 16, 8, ...
+    // Try a negative and a positive delta from the best known value.
+    for (offset = -delta; offset <= delta; offset += 2 * delta) {  // exactly two passes: -delta, +delta
+      const int green_to_red_cur = offset + green_to_red_best;
+      const float cur_diff = GetPredictionCostCrossColorRed(
+          argb, stride, tile_width, tile_height, prev_x, prev_y,
+          green_to_red_cur, accumulated_red_histo);
+      if (cur_diff < best_diff) {  // strict '<': ties keep the earlier value
+        best_diff = cur_diff;
+        green_to_red_best = green_to_red_cur;  // the +delta pass then steps from this new best
+      }
+    }
+  }
+  best_tx->green_to_red_ = green_to_red_best;  // NOTE(review): int may be negative; presumably narrowed by the field type - confirm
+}
+
+static float GetPredictionCostCrossColorBlue(  // cost of one (green_to_blue, red_to_blue) pair; lower is better
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y,  // neighbor multipliers supplied by the caller
+    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+  int histo[256] = { 0 };  // zeroed, then handed to the collector below
+  float cur_diff;
+
+  VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+                                 green_to_blue, red_to_blue, histo);
+
+  cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+  if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {  // candidates are compared after truncation to 8 bits
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_blue == 0) {  // zero multipliers get the same bonus
+    cur_diff -= 3;
+  }
+  if (red_to_blue == 0) {  // up to six bonuses of 3 can stack in total
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8  // number of rows in offset[] below
+#define kGreenRedToBlueMaxIters 7  // number of entries in delta_lut[] below
+static void GetBestGreenRedToBlue(  // searches (green_to_blue, red_to_blue) and stores the pair in *best_tx
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_blue_histo[256],
+    VP8LMultipliers* const best_tx) {  // only green_to_blue_ and red_to_blue_ are written
+  const int8_t offset[kGreenRedToBlueNumAxis][2] =  // {green_to_blue step, red_to_blue step}: 4 axis-aligned, then 4 diagonal
+      {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+  const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };  // step size per iteration
+  const int iters =
+      (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;  // 1, 4 or 7 iterations
+  int green_to_blue_best = 0;
+  int red_to_blue_best = 0;
+  int iter;
+  // Initial value at origin:
+  float best_diff = GetPredictionCostCrossColorBlue(
+      argb, stride, tile_width, tile_height, prev_x, prev_y,
+      green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+  for (iter = 0; iter < iters; ++iter) {
+    const int delta = delta_lut[iter];
+    int axis;
+    for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+      const int green_to_blue_cur =
+          offset[axis][0] * delta + green_to_blue_best;
+      const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+      const float cur_diff = GetPredictionCostCrossColorBlue(
+          argb, stride, tile_width, tile_height, prev_x, prev_y,
+          green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+      if (cur_diff < best_diff) {  // later axes in the same iteration step from the updated best
+        best_diff = cur_diff;
+        green_to_blue_best = green_to_blue_cur;
+        red_to_blue_best = red_to_blue_cur;
+      }
+      if (quality < 25 && iter == 4) {  // NOTE(review): iters is 1 when quality < 25, so iter == 4 never holds here
+        // Only axis aligned diffs for lower quality.
+        break;  // next iter.
+      }
+    }
+    if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+      // Further iterations would not help.
+      break;  // out of iter-loop.
+    }
+  }
+  best_tx->green_to_blue_ = green_to_blue_best;  // NOTE(review): ints may be negative; presumably narrowed by the field type - confirm
+  best_tx->red_to_blue_ = red_to_blue_best;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
+
+static VP8LMultipliers GetBestColorTransformForTile(  // returns the multipliers chosen for one tile
+    int tile_x, int tile_y, int bits,  // tile indices; tile side is 1 << bits
+    VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y,
+    int quality, int xsize, int ysize,  // xsize is also used as the row stride of argb
+    const int accumulated_red_histo[256],
+    const int accumulated_blue_histo[256],
+    const uint32_t* const argb) {
+  const int max_tile_size = 1 << bits;
+  const int tile_y_offset = tile_y * max_tile_size;  // pixel row of the tile's top edge
+  const int tile_x_offset = tile_x * max_tile_size;  // pixel column of the tile's left edge
+  const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+  const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+  const int tile_width = all_x_max - tile_x_offset;  // clipped at the image's right edge
+  const int tile_height = all_y_max - tile_y_offset;  // clipped at the image's bottom edge
+  const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+                                  + tile_x_offset;  // first pixel of the tile
+  VP8LMultipliers best_tx;
+  MultipliersClear(&best_tx);  // all multipliers start at 0
+
+  GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+                    prev_x, prev_y, quality, accumulated_red_histo, &best_tx);  // sets green_to_red_
+  GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+                        prev_x, prev_y, quality, accumulated_blue_histo,
+                        &best_tx);  // sets green_to_blue_ and red_to_blue_
+  return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,  // runs VP8LTransformColor on each row of one tile
+                                       int tile_x, int tile_y,  // pixel offsets of the tile corner, not tile indices
+                                       int max_tile_size,
+                                       VP8LMultipliers color_transform,
+                                       uint32_t* argb) {
+  const int xscan = GetMin(max_tile_size, xsize - tile_x);  // row length, clipped to the image
+  int yscan = GetMin(max_tile_size, ysize - tile_y);  // row count, clipped to the image
+  argb += tile_y * xsize + tile_x;  // move to the tile's first pixel
+  while (yscan-- > 0) {
+    VP8LTransformColor(&color_transform, argb, xscan);
+    argb += xsize;  // next image row
+  }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,  // picks and applies multipliers tile by tile
+                             uint32_t* const argb, uint32_t* image) {  // image: one color code per tile (out)
+  const int max_tile_size = 1 << bits;
+  const int tile_xsize = VP8LSubSampleSize(width, bits);  // tiles per row
+  const int tile_ysize = VP8LSubSampleSize(height, bits);  // tiles per column
+  int accumulated_red_histo[256] = { 0 };
+  int accumulated_blue_histo[256] = { 0 };
+  int tile_x, tile_y;
+  VP8LMultipliers prev_x, prev_y;
+  MultipliersClear(&prev_y);
+  MultipliersClear(&prev_x);  // cleared only here; not reset at the start of each tile row
+  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+      int y;
+      const int tile_x_offset = tile_x * max_tile_size;
+      const int tile_y_offset = tile_y * max_tile_size;
+      const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+      const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+      const int offset = tile_y * tile_xsize + tile_x;  // index of this tile in image[]
+      if (tile_y != 0) {
+        ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);  // multipliers of the tile directly above
+      }
+      prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+                                            prev_x, prev_y,
+                                            quality, width, height,
+                                            accumulated_red_histo,
+                                            accumulated_blue_histo,
+                                            argb);  // result becomes prev_x for the next tile
+      image[offset] = MultipliersToColorCode(&prev_x);
+      CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+                                 max_tile_size, prev_x, argb);  // takes pixel offsets, not tile indices
+
+      // Gather accumulated histogram data.
+      for (y = tile_y_offset; y < all_y_max; ++y) {
+        int ix = y * width + tile_x_offset;
+        const int ix_end = ix + all_x_max - tile_x_offset;
+        for (; ix < ix_end; ++ix) {
+          const uint32_t pix = argb[ix];  // read after CopyTileWithColorTransform ran on this tile
+          if (ix >= 2 &&
+              pix == argb[ix - 2] &&
+              pix == argb[ix - 1]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          if (ix >= width + 2 &&
+              argb[ix - 2] == argb[ix - width - 2] &&
+              argb[ix - 1] == argb[ix - width - 1] &&
+              pix == argb[ix - width]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          ++accumulated_red_histo[(pix >> 16) & 0xff];  // bits 16..23 of the pixel
+          ++accumulated_blue_histo[(pix >> 0) & 0xff];  // bits 0..7 of the pixel
+        }
+      }
+    }
+  }
+}
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant_enc.c
index 549ad26..b118fb2 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant_enc.c
@@ -15,8 +15,8 @@
 #include <math.h>
 #include <stdlib.h>  // for abs()
 
-#include "./vp8enci.h"
-#include "./cost.h"
+#include "./vp8i_enc.h"
+#include "./cost_enc.h"
 
 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@@ -278,7 +278,7 @@ static void SetupMatrices(VP8Encoder* enc) {
     CheckLambdaValue(&m->lambda_trellis_uv_);
     CheckLambdaValue(&m->tlambda_);
 
-    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
+    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
     m->max_edge_  = 0;
 
     m->i4_penalty_ = 1000 * q_i4 * q_i4;
@@ -643,6 +643,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     const int sign = (in[j] < 0);
     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
     int level0 = QUANTDIV(coeff0, iQ, B);
+    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
+    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
 
     {   // Swap current and previous score states
@@ -657,23 +659,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       int level = level0 + m;
       const int ctx = (level > 2) ? 2 : level;
       const int band = VP8EncBands[n + 1];
-      score_t base_score, last_pos_score;
+      score_t base_score;
       score_t best_cur_score = MAX_COST;
       int best_prev = 0;   // default, in case
 
       ss_cur[m].score = MAX_COST;
       ss_cur[m].costs = costs[n + 1][ctx];
-      if (level > MAX_LEVEL || level < 0) {   // node is dead?
+      if (level < 0 || level > thresh_level) {
+        // Node is dead.
         continue;
       }
 
-      // Compute extra rate cost if last coeff's position is < 15
-      {
-        const score_t last_pos_cost =
-            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
-        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
-      }
-
       {
         // Compute delta_error = how much coding this level will
         // subtract to max_error as distortion.
@@ -705,6 +701,9 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
 
       // Now, record best terminal node (and thus best entry in the graph).
       if (level != 0) {
+        const score_t last_pos_cost =
+            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
         const score_t score = best_cur_score + last_pos_score;
         if (score < best_score) {
           best_score = score;
@@ -874,9 +873,9 @@ static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
   // We look at the first three AC coefficients to determine what is the average
   // delta between each sub-4x4 block.
   const int v0 = abs(DCs[1]);
-  const int v1 = abs(DCs[4]);
-  const int v2 = abs(DCs[5]);
-  int max_v = (v0 > v1) ? v1 : v0;
+  const int v1 = abs(DCs[2]);
+  const int v2 = abs(DCs[4]);
+  int max_v = (v1 > v0) ? v1 : v0;
   max_v = (v2 > max_v) ? v2 : max_v;
   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
 }
@@ -957,7 +956,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
   // we have a blocky macroblock (only DCs are non-zero) with fairly high
   // distortion, record max delta so we can later adjust the minimal filtering
   // strength needed to smooth these blocks out.
-  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
+  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
     StoreMaxDelta(dqm, rd->y_dc_levels);
   }
 }
@@ -1155,7 +1154,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
   const int lambda_d_uv = 120;
   score_t score_i4 = dqm->i4_penalty_;
   score_t i4_bit_sum = 0;
-  const score_t bit_limit = it->enc_->mb_header_limit_;
+  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
+                                           : MAX_COST;  // no early-out allowed
 
   if (is_i16) {   // First, evaluate Intra16 distortion
     int best_mode = -1;
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax_enc.c
index a0e79ef..90665bd 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax_enc.c
@@ -16,7 +16,7 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"  // RIFF constants
 #include "../webp/mux_types.h"         // ALPHA_FLAG
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Helper functions
@@ -362,8 +362,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
   for (p = 0; p < enc->num_parts_; ++p) {
     const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
     const size_t size = VP8BitWriterSize(enc->parts_ + p);
-    if (size)
-      ok = ok && pic->writer(buf, size, pic);
+    if (size) ok = ok && pic->writer(buf, size, pic);
     VP8BitWriterWipeOut(enc->parts_ + p);    // will free the internal buffer.
     ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
                                   &enc->percent_);
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token_enc.c
index e73256b..02a0d72 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token_enc.c
@@ -20,8 +20,8 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./cost.h"
-#include "./vp8enci.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
 #include "../utils/utils.h"
 
 #if !defined(DISABLE_TOKEN_BUFFER)
@@ -87,14 +87,16 @@ static int TBufferNewPage(VP8TBuffer* const b) {
 #define TOKEN_ID(t, b, ctx) \
     (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
 
-static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b,
-                                     uint32_t bit, uint32_t proba_idx) {
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
+                                     uint32_t proba_idx,
+                                     proba_t* const stats) {
   assert(proba_idx < FIXED_PROBA_BIT);
   assert(bit <= 1);
   if (b->left_ > 0 || TBufferNewPage(b)) {
     const int slot = --b->left_;
     b->tokens_[slot] = (bit << 15) | proba_idx;
   }
+  VP8RecordStats(bit, stats);
   return bit;
 }
 
@@ -108,13 +110,16 @@ static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
   }
 }
 
-int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
-                         int first, int last,
-                         const int16_t* const coeffs,
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
                          VP8TBuffer* const tokens) {
-  int n = first;
+  const int16_t* const coeffs = res->coeffs;
+  const int coeff_type = res->coeff_type;
+  const int last = res->last;
+  int n = res->first;
   uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
-  if (!AddToken(tokens, last >= 0, base_id + 0)) {
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
+  if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
     return 0;
   }
 
@@ -122,18 +127,21 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
     const int c = coeffs[n++];
     const int sign = c < 0;
     const uint32_t v = sign ? -c : c;
-    if (!AddToken(tokens, v != 0, base_id + 1)) {
+    if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
       base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0);  // ctx=0
+      s = res->stats[VP8EncBands[n]][0];
       continue;
     }
-    if (!AddToken(tokens, v > 1, base_id + 2)) {
+    if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
       base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1);  // ctx=1
+      s = res->stats[VP8EncBands[n]][1];
     } else {
-      if (!AddToken(tokens, v > 4, base_id + 3)) {
-        if (AddToken(tokens, v != 2, base_id + 4))
-          AddToken(tokens, v == 4, base_id + 5);
-      } else if (!AddToken(tokens, v > 10, base_id + 6)) {
-        if (!AddToken(tokens, v > 6, base_id + 7)) {
+      if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
+        if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
+          AddToken(tokens, v == 4, base_id + 5, s + 5);
+        }
+      } else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
+        if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
           AddConstantToken(tokens, v == 6, 159);
         } else {
           AddConstantToken(tokens, v >= 9, 165);
@@ -144,26 +152,26 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
         const uint8_t* tab;
         uint32_t residue = v - 3;
         if (residue < (8 << 1)) {          // VP8Cat3  (3b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 0, base_id + 9);
+          AddToken(tokens, 0, base_id + 8, s + 8);
+          AddToken(tokens, 0, base_id + 9, s + 9);
           residue -= (8 << 0);
           mask = 1 << 2;
           tab = VP8Cat3;
         } else if (residue < (8 << 2)) {   // VP8Cat4  (4b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 1, base_id + 9);
+          AddToken(tokens, 0, base_id + 8, s + 8);
+          AddToken(tokens, 1, base_id + 9, s + 9);
           residue -= (8 << 1);
           mask = 1 << 3;
           tab = VP8Cat4;
         } else if (residue < (8 << 3)) {   // VP8Cat5  (5b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 0, base_id + 10);
+          AddToken(tokens, 1, base_id + 8, s + 8);
+          AddToken(tokens, 0, base_id + 10, s + 9);
           residue -= (8 << 2);
           mask = 1 << 4;
           tab = VP8Cat5;
         } else {                         // VP8Cat6 (11b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 1, base_id + 10);
+          AddToken(tokens, 1, base_id + 8, s + 8);
+          AddToken(tokens, 1, base_id + 10, s + 9);
           residue -= (8 << 3);
           mask = 1 << 10;
           tab = VP8Cat6;
@@ -174,9 +182,10 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
         }
       }
       base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2);  // ctx=2
+      s = res->stats[VP8EncBands[n]][2];
     }
     AddConstantToken(tokens, sign, 128);
-    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
+    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
       return 1;   // EOB
     }
   }
diff --git a/src/3rdparty/libwebp/src/enc/tree.c b/src/3rdparty/libwebp/src/enc/tree_enc.c
index f141006..2c40fe7 100644
--- a/src/3rdparty/libwebp/src/enc/tree.c
+++ b/src/3rdparty/libwebp/src/enc/tree_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Default probabilities
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
index c1fbd76..93c95ec 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -15,10 +15,10 @@
 #define WEBP_ENC_VP8ENCI_H_
 
 #include <string.h>     // for memcpy()
-#include "../dec/common.h"
+#include "../dec/common_dec.h"
 #include "../dsp/dsp.h"
-#include "../utils/bit_writer.h"
-#include "../utils/thread.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/thread_utils.h"
 #include "../utils/utils.h"
 #include "../webp/encode.h"
 
@@ -31,8 +31,8 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 5
-#define ENC_REV_VERSION 1
+#define ENC_MIN_VERSION 6
+#define ENC_REV_VERSION 0
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -219,7 +219,6 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
   int x_, y_;                      // current macroblock
-  int y_stride_, uv_stride_;       // respective strides
   uint8_t*      yuv_in_;           // input samples
   uint8_t*      yuv_out_;          // output samples
   uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
@@ -325,9 +324,7 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass);
 
 // record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
-                         int first, int last,
-                         const int16_t* const coeffs,
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
                          VP8TBuffer* const tokens);
 
 // Estimate the final coded size given a set of 'probas'.
@@ -476,14 +473,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
 int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
 int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data
 
-  // in filter.c
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst);
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
-                            const uint8_t* src2, int stride2,
-                            int W, int H, VP8DistoStats* const stats);
-double VP8SSIMGet(const VP8DistoStats* const stats);
-double VP8SSIMGetSquaredError(const VP8DistoStats* const stats);
-
 // autofilter
 void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
index c16e256..b1a793d 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,17 +15,18 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./backward_references.h"
-#include "./histogram.h"
-#include "./vp8enci.h"
-#include "./vp8li.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "./vp8i_enc.h"
+#include "./vp8li_enc.h"
 #include "../dsp/lossless.h"
-#include "../utils/bit_writer.h"
-#include "../utils/huffman_encode.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/huffman_encode_utils.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
-#include "./delta_palettization.h"
+#include "./delta_palettization_enc.h"
 
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 // Maximum number of histogram images (sub-blocks).
@@ -34,8 +35,8 @@
 // Palette reordering for smaller sum of deltas (and for smaller storage).
 
 static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
-  const uint32_t a = WebPMemToUint32(p1);
-  const uint32_t b = WebPMemToUint32(p2);
+  const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+  const uint32_t b = WebPMemToUint32((uint8_t*)p2);
   assert(a != b);
   return (a < b) ? -1 : 1;
 }
@@ -163,18 +164,25 @@ typedef enum {
   kHistoTotal  // Must be last.
 } HistoIx;
 
-static void AddSingleSubGreen(uint32_t p, uint32_t* r, uint32_t* b) {
-  const uint32_t green = p >> 8;  // The upper bits are masked away later.
+static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
+  const int green = p >> 8;  // The upper bits are masked away later.
   ++r[((p >> 16) - green) & 0xff];
-  ++b[(p - green) & 0xff];
+  ++b[((p >>  0) - green) & 0xff];
 }
 
 static void AddSingle(uint32_t p,
-                      uint32_t* a, uint32_t* r, uint32_t* g, uint32_t* b) {
-  ++a[p >> 24];
+                      uint32_t* const a, uint32_t* const r,
+                      uint32_t* const g, uint32_t* const b) {
+  ++a[(p >> 24) & 0xff];
   ++r[(p >> 16) & 0xff];
-  ++g[(p >> 8) & 0xff];
-  ++b[(p & 0xff)];
+  ++g[(p >>  8) & 0xff];
+  ++b[(p >>  0) & 0xff];
+}
+
+static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
+  // 8-bit hash. The mask just avoids an 'unsigned int overflow' warning.
+  const uint64_t mixed = ((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull;
+  return (uint32_t)((mixed & 0xffffffffu) >> 24);
+}
 
 static int AnalyzeEntropy(const uint32_t* argb,
@@ -214,8 +222,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
                           &histo[kHistoBluePredSubGreen * 256]);
         {
           // Approximate the palette by the entropy of the multiplicative hash.
-          const int hash = ((pix + (pix >> 19)) * 0x39c5fba7) >> 24;
-          ++histo[kHistoPalette * 256 + (hash & 0xff)];
+          const uint32_t hash = HashPix(pix);
+          ++histo[kHistoPalette * 256 + hash];
         }
       }
       prev_row = curr_row;
@@ -224,9 +232,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
     {
       double entropy_comp[kHistoTotal];
       double entropy[kNumEntropyIx];
-      EntropyIx k;
-      EntropyIx last_mode_to_analyze =
-          use_palette ? kPalette : kSpatialSubGreen;
+      int k;
+      int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
       int j;
       // Let's add one zero to the predicted histograms. The zeros are removed
       // too efficiently by the pix_diff == 0 comparison, at least one of the
@@ -263,7 +270,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
       *min_entropy_ix = kDirect;
       for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
         if (entropy[*min_entropy_ix] > entropy[k]) {
-          *min_entropy_ix = k;
+          *min_entropy_ix = (EntropyIx)k;
         }
       }
       *red_and_blue_always_zero = 1;
@@ -312,7 +319,10 @@ static int GetHistoBits(int method, int use_palette, int width, int height) {
 
 static int GetTransformBits(int method, int histo_bits) {
   const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
-  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+  const int res =
+      (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+  assert(res <= MAX_TRANSFORM_BITS);
+  return res;
 }
 
 static int AnalyzeAndInit(VP8LEncoder* const enc) {
@@ -697,7 +707,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
                                               VP8LHashChain* const hash_chain,
                                               VP8LBackwardRefs refs_array[2],
                                               int width, int height,
-                                              int quality) {
+                                              int quality, int low_effort) {
   int i;
   int max_tokens = 0;
   WebPEncodingError err = VP8_ENC_OK;
@@ -715,7 +725,8 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   }
 
   // Calculate backward references from ARGB image.
-  if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) {
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
@@ -815,11 +826,18 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     goto Error;
   }
 
-  *cache_bits = use_cache ? MAX_COLOR_CACHE_BITS : 0;
+  if (use_cache) {
+    // If the value is different from zero, it has been set during the
+    // palette analysis.
+    if (*cache_bits == 0) *cache_bits = MAX_COLOR_CACHE_BITS;
+  } else {
+    *cache_bits = 0;
+  }
   // 'best_refs' is the reference to the best backward refs and points to one
   // of refs_array[0] or refs_array[1].
   // Calculate backward references from ARGB image.
-  if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) {
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
@@ -900,7 +918,7 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
       err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
                                  VP8LSubSampleSize(width, histogram_bits),
                                  VP8LSubSampleSize(height, histogram_bits),
-                                 quality);
+                                 quality, low_effort);
       WebPSafeFree(histogram_argb);
       if (err != VP8_ENC_OK) goto Error;
     }
@@ -991,12 +1009,12 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
                               transform_width, transform_height,
-                              quality);
+                              quality, low_effort);
 }
 
 static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
                                                int width, int height,
-                                               int quality,
+                                               int quality, int low_effort,
                                                VP8LBitWriter* const bw) {
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
@@ -1012,7 +1030,7 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
                               transform_width, transform_height,
-                              quality);
+                              quality, low_effort);
 }
 
 // -----------------------------------------------------------------------------
@@ -1157,7 +1175,8 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
 
 // -----------------------------------------------------------------------------
 
-static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) {
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+                                        int hi) {
   int low = 0;
   if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
   while (1) {
@@ -1172,35 +1191,68 @@ static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) {
   }
 }
 
+#define APPLY_PALETTE_GREEDY_MAX 4
+
+static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
+                                              int palette_size,
+                                              uint32_t color) {
+  // Linear probe of the first three entries; anything else maps to index 3.
+  uint32_t idx = 0;
+  assert(palette_size < APPLY_PALETTE_GREEDY_MAX);
+  assert(3 == APPLY_PALETTE_GREEDY_MAX - 1);
+  (void)palette_size;  // only referenced by the assert above
+  while (idx < 3 && color != palette[idx]) ++idx;
+  return idx;
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
+  // The green channel (bits 8..15) alone serves as the hash.
+  return (color & 0xff00u) >> 8;
+}
+
+#define PALETTE_INV_SIZE_BITS 11
+#define PALETTE_INV_SIZE (1 << PALETTE_INV_SIZE_BITS)
+
+static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
+  const uint32_t rgb = color & 0x00ffffffu;  // alpha does not take part
+  return (rgb * 4222244071u) >> (32 - PALETTE_INV_SIZE_BITS);
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
+  // Hash of the RGB bits (alpha masked off) with the multiplier 2^31 - 1.
+  const uint32_t rgb = color & 0x00ffffffu;
+  return (rgb * 0x7fffffffu) >> (32 - PALETTE_INV_SIZE_BITS);
+}
+
 // Sort palette in increasing order and prepare an inverse mapping array.
 static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
-                                uint32_t sorted[], int idx_map[]) {
+                                uint32_t sorted[], uint32_t idx_map[]) {
   int i;
   memcpy(sorted, palette, num_colors * sizeof(*sorted));
   qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
   for (i = 0; i < num_colors; ++i) {
-    idx_map[SearchColor(sorted, palette[i], num_colors)] = i;
+    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
   }
 }
 
-static void MapToPalette(const uint32_t sorted_palette[], int num_colors,
-                         uint32_t* const last_pix, int* const last_idx,
-                         const int idx_map[],
-                         const uint32_t* src, uint8_t* dst, int width) {
-  int x;
-  int prev_idx = *last_idx;
-  uint32_t prev_pix = *last_pix;
-  for (x = 0; x < width; ++x) {
-    const uint32_t pix = src[x];
-    if (pix != prev_pix) {
-      prev_idx = idx_map[SearchColor(sorted_palette, pix, num_colors)];
-      prev_pix = pix;
-    }
-    dst[x] = prev_idx;
-  }
-  *last_idx = prev_idx;
-  *last_pix = prev_pix;
-}
+// Use 1 pixel cache for ARGB pixels.
+#define APPLY_PALETTE_FOR(COLOR_INDEX) do {         \
+  uint32_t prev_pix = palette[0];                   \
+  uint32_t prev_idx = 0;                            \
+  for (y = 0; y < height; ++y) {                    \
+    for (x = 0; x < width; ++x) {                   \
+      const uint32_t pix = src[x];                  \
+      if (pix != prev_pix) {                        \
+        prev_idx = COLOR_INDEX;                     \
+        prev_pix = pix;                             \
+      }                                             \
+      tmp_row[x] = prev_idx;                        \
+    }                                               \
+    VP8LBundleColorMap(tmp_row, width, xbits, dst); \
+    src += src_stride;                              \
+    dst += dst_stride;                              \
+  }                                                 \
+} while (0)
 
 // Remap argb values in src[] to packed palettes entries in dst[]
 // using 'row' as a temporary buffer of size 'width'.
@@ -1213,52 +1265,59 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
   // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
   // made to work in-place.
   uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
-  int i, x, y;
-  int use_LUT = 1;
+  int x, y;
 
   if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-  for (i = 0; i < palette_size; ++i) {
-    if ((palette[i] & 0xffff00ffu) != 0) {
-      use_LUT = 0;
-      break;
-    }
-  }
 
-  if (use_LUT) {
-    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
-    for (i = 0; i < palette_size; ++i) {
-      const int color = (palette[i] >> 8) & 0xff;
-      inv_palette[color] = i;
-    }
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const int color = (src[x] >> 8) & 0xff;
-        tmp_row[x] = inv_palette[color];
+  if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
+    APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
+  } else {
+    int i, j;
+    uint16_t buffer[PALETTE_INV_SIZE];
+    uint32_t (*const hash_functions[])(uint32_t) = {
+        ApplyPaletteHash0, ApplyPaletteHash1, ApplyPaletteHash2
+    };
+
+    // Try to find a perfect hash function able to go from a color to an index
+    // within 1 << PALETTE_INV_SIZE_BITS in order to build a hash map to go
+    // from color to index in palette.
+    for (i = 0; i < 3; ++i) {
+      int use_LUT = 1;
+      // Set each element in buffer to max uint16_t.
+      memset(buffer, 0xff, sizeof(buffer));
+      for (j = 0; j < palette_size; ++j) {
+        const uint32_t ind = hash_functions[i](palette[j]);
+        if (buffer[ind] != 0xffffu) {
+          use_LUT = 0;
+          break;
+        } else {
+          buffer[ind] = j;
+        }
       }
-      VP8LBundleColorMap(tmp_row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
+      if (use_LUT) break;
     }
-  } else {
-    // Use 1 pixel cache for ARGB pixels.
-    uint32_t last_pix;
-    int last_idx;
-    uint32_t sorted[MAX_PALETTE_SIZE];
-    int idx_map[MAX_PALETTE_SIZE];
-    PrepareMapToPalette(palette, palette_size, sorted, idx_map);
-    last_pix = palette[0];
-    last_idx = 0;
-    for (y = 0; y < height; ++y) {
-      MapToPalette(sorted, palette_size, &last_pix, &last_idx,
-                   idx_map, src, tmp_row, width);
-      VP8LBundleColorMap(tmp_row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
+
+    if (i == 0) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash0(pix)]);
+    } else if (i == 1) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash1(pix)]);
+    } else if (i == 2) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash2(pix)]);
+    } else {
+      uint32_t idx_map[MAX_PALETTE_SIZE];
+      uint32_t palette_sorted[MAX_PALETTE_SIZE];
+      PrepareMapToPalette(palette, palette_size, palette_sorted, idx_map);
+      APPLY_PALETTE_FOR(
+          idx_map[SearchColorNoIdx(palette_sorted, pix, palette_size)]);
     }
   }
   WebPSafeFree(tmp_row);
   return VP8_ENC_OK;
 }
+#undef APPLY_PALETTE_FOR
+#undef PALETTE_INV_SIZE_BITS
+#undef PALETTE_INV_SIZE
+#undef APPLY_PALETTE_GREEDY_MAX
 
 // Note: Expects "enc->palette_" to be set properly.
 static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
@@ -1291,7 +1350,7 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
 }
 
 // Save palette_[] to bitstream.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
                                        VP8LEncoder* const enc) {
   int i;
   uint32_t tmp_palette[MAX_PALETTE_SIZE];
@@ -1306,13 +1365,14 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   }
   tmp_palette[0] = palette[0];
   return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
-                              palette_size, 1, 20 /* quality */);
+                              palette_size, 1, 20 /* quality */, low_effort);
 }
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
 
 static WebPEncodingError EncodeDeltaPalettePredictorImage(
-    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality) {
+    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
+    int low_effort) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
@@ -1343,7 +1403,7 @@ static WebPEncodingError EncodeDeltaPalettePredictorImage(
   err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
                              transform_width, transform_height,
-                             quality);
+                             quality, low_effort);
   WebPSafeFree(predictors);
   return err;
 }
@@ -1394,7 +1454,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   int use_near_lossless = 0;
   int hdr_size = 0;
   int data_size = 0;
-  int use_delta_palettization = 0;
+  int use_delta_palette = 0;
 
   if (enc == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -1421,7 +1481,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   }
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (config->delta_palettization) {
+  if (config->use_delta_palette) {
     enc->use_predict_ = 1;
     enc->use_cross_color_ = 0;
     enc->use_subtract_green_ = 0;
@@ -1433,21 +1493,25 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     if (enc->use_palette_) {
       err = AllocateTransformBuffer(enc, width, height);
       if (err != VP8_ENC_OK) goto Error;
-      err = EncodeDeltaPalettePredictorImage(bw, enc, quality);
+      err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
       if (err != VP8_ENC_OK) goto Error;
-      use_delta_palettization = 1;
+      use_delta_palette = 1;
     }
   }
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
   // Encode palette
   if (enc->use_palette_) {
-    err = EncodePalette(bw, enc);
+    err = EncodePalette(bw, low_effort, enc);
     if (err != VP8_ENC_OK) goto Error;
-    err = MapImageFromPalette(enc, use_delta_palettization);
+    err = MapImageFromPalette(enc, use_delta_palette);
     if (err != VP8_ENC_OK) goto Error;
+    // If using a color cache, do not have it bigger than the number of colors.
+    if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+      enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+    }
   }
-  if (!use_delta_palettization) {
+  if (!use_delta_palette) {
     // In case image is not packed.
     if (enc->argb_ == NULL) {
       err = MakeInputImageCopy(enc);
@@ -1469,7 +1533,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
 
     if (enc->use_cross_color_) {
       err = ApplyCrossColorFilter(enc, enc->current_width_,
-                                  height, quality, bw);
+                                  height, quality, low_effort, bw);
       if (err != VP8_ENC_OK) goto Error;
     }
   }
diff --git a/src/3rdparty/libwebp/src/enc/vp8li.h b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
index 371e276..8c5fbcb 100644
--- a/src/3rdparty/libwebp/src/enc/vp8li.h
+++ b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
@@ -14,9 +14,9 @@
 #ifndef WEBP_ENC_VP8LI_H_
 #define WEBP_ENC_VP8LI_H_
 
-#include "./backward_references.h"
-#include "./histogram.h"
-#include "../utils/bit_writer.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "../utils/bit_writer_utils.h"
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"
 
@@ -24,6 +24,9 @@
 extern "C" {
 #endif
 
+// maximum value of transform_bits_ in VP8LEncoder.
+#define MAX_TRANSFORM_BITS 6
+
 typedef struct {
   const WebPConfig* config_;      // user configuration and parameters
   const WebPPicture* pic_;        // input picture.
@@ -39,7 +42,7 @@ typedef struct {
 
   // Encoding parameters derived from quality parameter.
   int histo_bits_;
-  int transform_bits_;
+  int transform_bits_;    // <= MAX_TRANSFORM_BITS.
   int cache_bits_;        // If equal to 0, don't use color cache.
 
   // Encoding parameters derived from image characteristics.
@@ -73,6 +76,17 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
                                    VP8LBitWriter* const bw, int use_cache);
 
 //------------------------------------------------------------------------------
+// Image transforms in predictor_enc.c.
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image, int near_lossless, int exact,
+                       int used_subtract_green);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                             uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webp_enc.c
index a7d04ea..f18461e 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webp_enc.c
@@ -16,9 +16,9 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost.h"
-#include "./vp8enci.h"
-#include "./vp8li.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
+#include "./vp8li_enc.h"
 #include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
@@ -75,7 +75,7 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
 //-------------------+---+---+---+---+---+---+---+
 // dynamic proba     | ~ | x | x | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
-// fast mode analysis|   |   |   |   | x | x | x |
+// fast mode analysis|[x]|[x]|   |   | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
 // basic rd-opt      |   |   |   | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
@@ -315,18 +315,21 @@ int WebPReportProgress(const WebPPicture* const pic,
 
 int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   int ok = 0;
+  if (pic == NULL) return 0;
 
-  if (pic == NULL)
-    return 0;
   WebPEncodingSetError(pic, VP8_ENC_OK);  // all ok so far
-  if (config == NULL)  // bad params
+  if (config == NULL) {  // bad params
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
-  if (!WebPValidateConfig(config))
+  }
+  if (!WebPValidateConfig(config)) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
-  if (pic->width <= 0 || pic->height <= 0)
+  }
+  if (pic->width <= 0 || pic->height <= 0) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
-  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
+  }
+  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
 
   if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
 
@@ -339,8 +342,8 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
 
     if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
-      if (config->preprocessing & 4) {
-        if (!WebPPictureSmartARGBToYUVA(pic)) {
+      if (config->use_sharp_yuv || (config->preprocessing & 4)) {
+        if (!WebPPictureSharpARGBToYUVA(pic)) {
           return 0;
         }
       } else {
diff --git a/src/3rdparty/libwebp/src/extras/extras.c b/src/3rdparty/libwebp/src/extras/extras.c
deleted file mode 100644
index ca32fbc..0000000
--- a/src/3rdparty/libwebp/src/extras/extras.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Additional WebP utilities.
-//
-
-#include "../webp/extras.h"
-
-#include <string.h>
-
-#define XTRA_MAJ_VERSION 0
-#define XTRA_MIN_VERSION 0
-#define XTRA_REV_VERSION 0
-
-//------------------------------------------------------------------------------
-
-int WebPGetExtrasVersion(void) {
-  return (XTRA_MAJ_VERSION << 16) | (XTRA_MIN_VERSION << 8) | XTRA_REV_VERSION;
-}
-
-//------------------------------------------------------------------------------
-
-int WebPImportGray(const uint8_t* gray_data, WebPPicture* pic) {
-  int y, width, uv_width;
-  if (pic == NULL || gray_data == NULL) return 0;
-  pic->colorspace = WEBP_YUV420;
-  if (!WebPPictureAlloc(pic)) return 0;
-  width = pic->width;
-  uv_width = (width + 1) >> 1;
-  for (y = 0; y < pic->height; ++y) {
-    memcpy(pic->y + y * pic->y_stride, gray_data, width);
-    gray_data += width;    // <- we could use some 'data_stride' here if needed
-    if ((y & 1) == 0) {
-      memset(pic->u + (y >> 1) * pic->uv_stride, 128, uv_width);
-      memset(pic->v + (y >> 1) * pic->uv_stride, 128, uv_width);
-    }
-  }
-  return 1;
-}
-
-int WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic) {
-  int x, y;
-  if (pic == NULL || rgb565 == NULL) return 0;
-  pic->colorspace = WEBP_YUV420;
-  pic->use_argb = 1;
-  if (!WebPPictureAlloc(pic)) return 0;
-  for (y = 0; y < pic->height; ++y) {
-    const int width = pic->width;
-    uint32_t* dst = pic->argb + y * pic->argb_stride;
-    for (x = 0; x < width; ++x) {
-#ifdef WEBP_SWAP_16BIT_CSP
-      const uint32_t rg = rgb565[2 * x + 1];
-      const uint32_t gb = rgb565[2 * x + 0];
-#else
-      const uint32_t rg = rgb565[2 * x + 0];
-      const uint32_t gb = rgb565[2 * x + 1];
-#endif
-      uint32_t r = rg & 0xf8;
-      uint32_t g = ((rg << 5) | (gb >> 3)) & 0xfc;
-      uint32_t b = (gb << 5);
-      // dithering
-      r = r | (r >> 5);
-      g = g | (g >> 6);
-      b = b | (b >> 5);
-      dst[x] = (r << 16) | (g << 8) | b;
-    }
-    rgb565 += 2 * width;
-  }
-  return 1;
-}
-
-int WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic) {
-  int x, y;
-  if (pic == NULL || rgb4444 == NULL) return 0;
-  pic->colorspace = WEBP_YUV420;
-  pic->use_argb = 1;
-  if (!WebPPictureAlloc(pic)) return 0;
-  for (y = 0; y < pic->height; ++y) {
-    const int width = pic->width;
-    uint32_t* dst = pic->argb + y * pic->argb_stride;
-    for (x = 0; x < width; ++x) {
-#ifdef WEBP_SWAP_16BIT_CSP
-      const uint32_t rg = rgb4444[2 * x + 1];
-      const uint32_t ba = rgb4444[2 * x + 0];
-#else
-      const uint32_t rg = rgb4444[2 * x + 0];
-      const uint32_t ba = rgb4444[2 * x + 1];
-#endif
-      uint32_t r = rg & 0xf0;
-      uint32_t g = (rg << 4);
-      uint32_t b = (ba & 0xf0);
-      uint32_t a = (ba << 4);
-      // dithering
-      r = r | (r >> 4);
-      g = g | (g >> 4);
-      b = b | (b >> 4);
-      a = a | (a >> 4);
-      dst[x] = (a << 24) | (r << 16) | (g << 8) | b;
-    }
-    rgb4444 += 2 * width;
-  }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/mux/anim_encode.c b/src/3rdparty/libwebp/src/mux/anim_encode.c
index 53e2906..6066388 100644
--- a/src/3rdparty/libwebp/src/mux/anim_encode.c
+++ b/src/3rdparty/libwebp/src/mux/anim_encode.c
@@ -16,6 +16,7 @@
 #include <stdio.h>
 #include <stdlib.h>  // for abs()
 
+#include "../mux/animi.h"
 #include "../utils/utils.h"
 #include "../webp/decode.h"
 #include "../webp/encode.h"
@@ -128,14 +129,13 @@ static void SanitizeEncoderOptions(WebPAnimEncoderOptions* const enc_options) {
     DisableKeyframes(enc_options);
   }
 
-  if (enc_options->kmin <= 0) {
-    DisableKeyframes(enc_options);
-    print_warning = 0;
-  }
-  if (enc_options->kmax <= 0) {  // All frames will be key-frames.
+  if (enc_options->kmax == 1) {  // All frames will be key-frames.
     enc_options->kmin = 0;
     enc_options->kmax = 0;
     return;
+  } else if (enc_options->kmax <= 0) {
+    DisableKeyframes(enc_options);
+    print_warning = 0;
   }
 
   if (enc_options->kmin >= enc_options->kmax) {
@@ -378,10 +378,10 @@ static WEBP_INLINE int PixelsAreSimilar(uint32_t src, uint32_t dst,
   const int dst_g = (dst >> 8) & 0xff;
   const int dst_b = (dst >> 0) & 0xff;
 
-  return (abs(src_r * src_a - dst_r * dst_a) <= (max_allowed_diff * 255)) &&
-         (abs(src_g * src_a - dst_g * dst_a) <= (max_allowed_diff * 255)) &&
-         (abs(src_b * src_a - dst_b * dst_a) <= (max_allowed_diff * 255)) &&
-         (abs(src_a - dst_a) <= max_allowed_diff);
+  return (src_a == dst_a) &&
+         (abs(src_r - dst_r) * dst_a <= (max_allowed_diff * 255)) &&
+         (abs(src_g - dst_g) * dst_a <= (max_allowed_diff * 255)) &&
+         (abs(src_b - dst_b) * dst_a <= (max_allowed_diff * 255));
 }
 
 // Returns true if 'length' number of pixels in 'src' and 'dst' are within an
@@ -586,6 +586,51 @@ static int GetSubRects(const WebPPicture* const prev_canvas,
                     &params->rect_lossy_, &params->sub_frame_lossy_);
 }
 
+// Returns min_v if v < min_v, else max_v if v > max_v, else v.
+static WEBP_INLINE int clip(int v, int min_v, int max_v) {
+  return (v < min_v) ? min_v : (v > max_v) ? max_v : v;
+}
+
+// Refines the rectangle given by (*x_offset, *y_offset, *width, *height): it
+// is clipped to the bounds of 'curr_canvas', passed to
+// MinimizeChangeRectangle() along with both canvases, 'is_lossless' and
+// 'quality', and then to SnapToEvenOffsets(). The result is written back
+// through the four in/out pointers.
+// Returns 0 without touching the in/out values if either canvas is NULL, if
+// their dimensions differ, or if either one has 'use_argb' unset.
+// Returns 1 otherwise.
+int WebPAnimEncoderRefineRect(
+    const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas,
+    int is_lossless, float quality, int* const x_offset, int* const y_offset,
+    int* const width, int* const height) {
+  FrameRect rect;
+  int left, top, right, bottom;
+  // Validate the canvases before reading 'curr_canvas' for the clipping
+  // bounds below; computing the bounds first would dereference a NULL pointer.
+  if (prev_canvas == NULL || curr_canvas == NULL ||
+      prev_canvas->width != curr_canvas->width ||
+      prev_canvas->height != curr_canvas->height ||
+      !prev_canvas->use_argb || !curr_canvas->use_argb) {
+    return 0;
+  }
+  right = clip(*x_offset + *width, 0, curr_canvas->width);
+  left = clip(*x_offset, 0, curr_canvas->width - 1);
+  bottom = clip(*y_offset + *height, 0, curr_canvas->height);
+  top = clip(*y_offset, 0, curr_canvas->height - 1);
+  rect.x_offset_ = left;
+  rect.y_offset_ = top;
+  rect.width_ = clip(right - left, 0, curr_canvas->width - rect.x_offset_);
+  rect.height_ = clip(bottom - top, 0, curr_canvas->height - rect.y_offset_);
+  MinimizeChangeRectangle(prev_canvas, curr_canvas, &rect, is_lossless,
+                          quality);
+  SnapToEvenOffsets(&rect);
+  *x_offset = rect.x_offset_;
+  *y_offset = rect.y_offset_;
+  *width = rect.width_;
+  *height = rect.height_;
+  return 1;
+}
+
 static void DisposeFrameRectangle(int dispose_method,
                                   const FrameRect* const rect,
                                   WebPPicture* const curr_canvas) {
@@ -829,8 +862,8 @@ static WebPEncodingError GenerateCandidates(
   WebPPicture* const curr_canvas = &enc->curr_canvas_copy_;
   const WebPPicture* const prev_canvas =
       is_dispose_none ? &enc->prev_canvas_ : &enc->prev_canvas_disposed_;
-  int use_blending_ll;
-  int use_blending_lossy;
+  int use_blending_ll, use_blending_lossy;
+  int evaluate_ll, evaluate_lossy;
 
   CopyCurrentCanvas(enc);
   use_blending_ll =
@@ -843,16 +876,19 @@ static WebPEncodingError GenerateCandidates(
 
   // Pick candidates to be tried.
   if (!enc->options_.allow_mixed) {
-    candidate_ll->evaluate_ = is_lossless;
-    candidate_lossy->evaluate_ = !is_lossless;
+    evaluate_ll = is_lossless;
+    evaluate_lossy = !is_lossless;
+  } else if (enc->options_.minimize_size) {
+    evaluate_ll = 1;
+    evaluate_lossy = 1;
   } else {  // Use a heuristic for trying lossless and/or lossy compression.
     const int num_colors = WebPGetColorPalette(&params->sub_frame_ll_, NULL);
-    candidate_ll->evaluate_ = (num_colors < MAX_COLORS_LOSSLESS);
-    candidate_lossy->evaluate_ = (num_colors >= MIN_COLORS_LOSSY);
+    evaluate_ll = (num_colors < MAX_COLORS_LOSSLESS);
+    evaluate_lossy = (num_colors >= MIN_COLORS_LOSSY);
   }
 
   // Generate candidates.
-  if (candidate_ll->evaluate_) {
+  if (evaluate_ll) {
     CopyCurrentCanvas(enc);
     if (use_blending_ll) {
       enc->curr_canvas_copy_modified_ =
@@ -862,7 +898,7 @@ static WebPEncodingError GenerateCandidates(
                                  config_ll, use_blending_ll, candidate_ll);
     if (error_code != VP8_ENC_OK) return error_code;
   }
-  if (candidate_lossy->evaluate_) {
+  if (evaluate_lossy) {
     CopyCurrentCanvas(enc);
     if (use_blending_lossy) {
       enc->curr_canvas_copy_modified_ =
@@ -1029,6 +1065,8 @@ static WebPEncodingError SetFrame(WebPAnimEncoder* const enc,
   const WebPPicture* const prev_canvas = &enc->prev_canvas_;
   Candidate candidates[CANDIDATE_COUNT];
   const int is_lossless = config->lossless;
+  const int consider_lossless = is_lossless || enc->options_.allow_mixed;
+  const int consider_lossy = !is_lossless || enc->options_.allow_mixed;
   const int is_first_frame = enc->is_first_frame_;
 
   // First frame cannot be skipped as there is no 'previous frame' to merge it
@@ -1066,9 +1104,7 @@ static WebPEncodingError SetFrame(WebPAnimEncoder* const enc,
     return VP8_ENC_ERROR_INVALID_CONFIGURATION;
   }
 
-  for (i = 0; i < CANDIDATE_COUNT; ++i) {
-    candidates[i].evaluate_ = 0;
-  }
+  memset(candidates, 0, sizeof(candidates));
 
   // Change-rectangle assuming previous frame was DISPOSE_NONE.
   if (!GetSubRects(prev_canvas, curr_canvas, is_key_frame, is_first_frame,
@@ -1077,8 +1113,8 @@ static WebPEncodingError SetFrame(WebPAnimEncoder* const enc,
     goto Err;
   }
 
-  if ((is_lossless && IsEmptyRect(&dispose_none_params.rect_ll_)) ||
-      (!is_lossless && IsEmptyRect(&dispose_none_params.rect_lossy_))) {
+  if ((consider_lossless && IsEmptyRect(&dispose_none_params.rect_ll_)) ||
+      (consider_lossy && IsEmptyRect(&dispose_none_params.rect_lossy_))) {
     // Don't encode the frame at all. Instead, the duration of the previous
     // frame will be increased later.
     assert(empty_rect_allowed_none);
@@ -1187,16 +1223,20 @@ static int CacheFrame(WebPAnimEncoder* const enc,
       enc->prev_candidate_undecided_ = 0;
     } else {
       int64_t curr_delta;
+      FrameRect prev_rect_key, prev_rect_sub;
 
       // Add this as a frame rectangle to enc.
       error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped);
       if (error_code != VP8_ENC_OK) goto End;
       if (frame_skipped) goto Skip;
+      prev_rect_sub = enc->prev_rect_;
+
 
       // Add this as a key-frame to enc, too.
       error_code = SetFrame(enc, config, 1, encoded_frame, &frame_skipped);
       if (error_code != VP8_ENC_OK) goto End;
       assert(frame_skipped == 0);  // Key-frame cannot be an empty rectangle.
+      prev_rect_key = enc->prev_rect_;
 
       // Analyze size difference of the two variants.
       curr_delta = KeyFramePenalty(encoded_frame);
@@ -1207,11 +1247,13 @@ static int CacheFrame(WebPAnimEncoder* const enc,
           old_keyframe->is_key_frame_ = 0;
         }
         encoded_frame->is_key_frame_ = 1;
+        enc->prev_candidate_undecided_ = 1;
         enc->keyframe_ = (int)position;
         enc->best_delta_ = curr_delta;
         enc->flush_count_ = enc->count_ - 1;  // We can flush previous frames.
       } else {
         encoded_frame->is_key_frame_ = 0;
+        enc->prev_candidate_undecided_ = 0;
       }
       // Note: We need '>=' below because when kmin and kmax are both zero,
       // count_since_key_frame will always be > kmax.
@@ -1221,7 +1263,10 @@ static int CacheFrame(WebPAnimEncoder* const enc,
         enc->keyframe_ = KEYFRAME_NONE;
         enc->best_delta_ = DELTA_INFINITY;
       }
-      enc->prev_candidate_undecided_ = 1;
+      if (!enc->prev_candidate_undecided_) {
+        enc->prev_rect_ =
+            encoded_frame->is_key_frame_ ? prev_rect_key : prev_rect_sub;
+      }
     }
   }
 
diff --git a/src/3rdparty/libwebp/src/mux/animi.h b/src/3rdparty/libwebp/src/mux/animi.h
new file mode 100644
index 0000000..cecaf1f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/mux/animi.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Internal header for animation related functions.
+//
+// Author: Hui Su (huisu@google.com)
+
+#ifndef WEBP_MUX_ANIMI_H_
+#define WEBP_MUX_ANIMI_H_
+
+#include "../webp/mux.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Picks the optimal rectangle between two pictures, starting with initial
+// values of offsets and dimensions that are passed in. The initial
+// values will be clipped, if necessary, to make sure the rectangle is
+// within the canvas. "use_argb" must be true for both pictures.
+// Parameters:
+//   prev_canvas, curr_canvas - (in) two input pictures to compare.
+//   is_lossless, quality - (in) encoding settings.
+//   x_offset, y_offset, width, height - (in/out) rectangle between the two
+//                                                input pictures.
+// Returns true on success.
+int WebPAnimEncoderRefineRect(
+    const struct WebPPicture* const prev_canvas,
+    const struct WebPPicture* const curr_canvas,
+    int is_lossless, float quality, int* const x_offset, int* const y_offset,
+    int* const width, int* const height);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_MUX_ANIMI_H_ */
diff --git a/src/3rdparty/libwebp/src/mux/muxedit.c b/src/3rdparty/libwebp/src/mux/muxedit.c
index 9bbed42..d2c5305 100644
--- a/src/3rdparty/libwebp/src/mux/muxedit.c
+++ b/src/3rdparty/libwebp/src/mux/muxedit.c
@@ -93,34 +93,32 @@ static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, uint32_t nth,
 }
 #undef SWITCH_ID_LIST
 
-// Create data for frame/fragment given image data, offsets and duration.
-static WebPMuxError CreateFrameFragmentData(
-    int width, int height, const WebPMuxFrameInfo* const info, int is_frame,
-    WebPData* const frame_frgm) {
-  uint8_t* frame_frgm_bytes;
-  const size_t frame_frgm_size = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].size;
+// Create data for frame given image data, offsets and duration.
+static WebPMuxError CreateFrameData(
+    int width, int height, const WebPMuxFrameInfo* const info,
+    WebPData* const frame) {
+  uint8_t* frame_bytes;
+  const size_t frame_size = kChunks[IDX_ANMF].size;
 
   assert(width > 0 && height > 0 && info->duration >= 0);
   assert(info->dispose_method == (info->dispose_method & 1));
   // Note: assertion on upper bounds is done in PutLE24().
 
-  frame_frgm_bytes = (uint8_t*)WebPSafeMalloc(1ULL, frame_frgm_size);
-  if (frame_frgm_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
+  frame_bytes = (uint8_t*)WebPSafeMalloc(1ULL, frame_size);
+  if (frame_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
 
-  PutLE24(frame_frgm_bytes + 0, info->x_offset / 2);
-  PutLE24(frame_frgm_bytes + 3, info->y_offset / 2);
+  PutLE24(frame_bytes + 0, info->x_offset / 2);
+  PutLE24(frame_bytes + 3, info->y_offset / 2);
 
-  if (is_frame) {
-    PutLE24(frame_frgm_bytes + 6, width - 1);
-    PutLE24(frame_frgm_bytes + 9, height - 1);
-    PutLE24(frame_frgm_bytes + 12, info->duration);
-    frame_frgm_bytes[15] =
-        (info->blend_method == WEBP_MUX_NO_BLEND ? 2 : 0) |
-        (info->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ? 1 : 0);
-  }
+  PutLE24(frame_bytes + 6, width - 1);
+  PutLE24(frame_bytes + 9, height - 1);
+  PutLE24(frame_bytes + 12, info->duration);
+  frame_bytes[15] =
+      (info->blend_method == WEBP_MUX_NO_BLEND ? 2 : 0) |
+      (info->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ? 1 : 0);
 
-  frame_frgm->bytes = frame_frgm_bytes;
-  frame_frgm->size = frame_frgm_size;
+  frame->bytes = frame_bytes;
+  frame->size = frame_size;
   return WEBP_MUX_OK;
 }
 
@@ -264,23 +262,16 @@ WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,
   return err;
 }
 
-WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
+WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info,
                               int copy_data) {
   WebPMuxImage wpi;
   WebPMuxError err;
-  int is_frame;
-  const WebPData* const bitstream = &frame->bitstream;
+  const WebPData* const bitstream = &info->bitstream;
 
   // Sanity checks.
-  if (mux == NULL || frame == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  if (mux == NULL || info == NULL) return WEBP_MUX_INVALID_ARGUMENT;
 
-  is_frame = (frame->id == WEBP_CHUNK_ANMF);
-  if (!(is_frame || (frame->id == WEBP_CHUNK_FRGM))) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-  if (frame->id == WEBP_CHUNK_FRGM) {     // Dead experiment.
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
+  if (info->id != WEBP_CHUNK_ANMF) return WEBP_MUX_INVALID_ARGUMENT;
 
   if (bitstream->bytes == NULL || bitstream->size > MAX_CHUNK_PAYLOAD) {
     return WEBP_MUX_INVALID_ARGUMENT;
@@ -290,7 +281,7 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
     const WebPMuxImage* const image = mux->images_;
     const uint32_t image_id = (image->header_ != NULL) ?
         ChunkGetIdFromTag(image->header_->tag_) : WEBP_CHUNK_IMAGE;
-    if (image_id != frame->id) {
+    if (image_id != info->id) {
       return WEBP_MUX_INVALID_ARGUMENT;  // Conflicting frame types.
     }
   }
@@ -301,16 +292,11 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
   assert(wpi.img_ != NULL);  // As SetAlphaAndImageChunks() was successful.
 
   {
-    WebPData frame_frgm;
-    const uint32_t tag = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].tag;
-    WebPMuxFrameInfo tmp = *frame;
+    WebPData frame;
+    const uint32_t tag = kChunks[IDX_ANMF].tag;
+    WebPMuxFrameInfo tmp = *info;
     tmp.x_offset &= ~1;  // Snap offsets to even.
     tmp.y_offset &= ~1;
-    if (!is_frame) {  // Reset unused values.
-      tmp.duration = 1;
-      tmp.dispose_method = WEBP_MUX_DISPOSE_NONE;
-      tmp.blend_method = WEBP_MUX_BLEND;
-    }
     if (tmp.x_offset < 0 || tmp.x_offset >= MAX_POSITION_OFFSET ||
         tmp.y_offset < 0 || tmp.y_offset >= MAX_POSITION_OFFSET ||
         (tmp.duration < 0 || tmp.duration >= MAX_DURATION) ||
@@ -318,12 +304,11 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
       err = WEBP_MUX_INVALID_ARGUMENT;
       goto Err;
     }
-    err = CreateFrameFragmentData(wpi.width_, wpi.height_, &tmp, is_frame,
-                                  &frame_frgm);
+    err = CreateFrameData(wpi.width_, wpi.height_, &tmp, &frame);
     if (err != WEBP_MUX_OK) goto Err;
-    // Add frame/fragment chunk (with copy_data = 1).
-    err = AddDataToChunkList(&frame_frgm, 1, tag, &wpi.header_);
-    WebPDataClear(&frame_frgm);  // frame_frgm owned by wpi.header_ now.
+    // Add frame chunk (with copy_data = 1).
+    err = AddDataToChunkList(&frame, 1, tag, &wpi.header_);
+    WebPDataClear(&frame);  // frame owned by wpi.header_ now.
     if (err != WEBP_MUX_OK) goto Err;
   }
 
@@ -402,21 +387,18 @@ WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth) {
 //------------------------------------------------------------------------------
 // Assembly of the WebP RIFF file.
 
-static WebPMuxError GetFrameFragmentInfo(
-    const WebPChunk* const frame_frgm_chunk,
+static WebPMuxError GetFrameInfo(
+    const WebPChunk* const frame_chunk,
     int* const x_offset, int* const y_offset, int* const duration) {
-  const uint32_t tag = frame_frgm_chunk->tag_;
-  const int is_frame = (tag == kChunks[IDX_ANMF].tag);
-  const WebPData* const data = &frame_frgm_chunk->data_;
-  const size_t expected_data_size =
-      is_frame ? ANMF_CHUNK_SIZE : FRGM_CHUNK_SIZE;
-  assert(frame_frgm_chunk != NULL);
-  assert(tag == kChunks[IDX_ANMF].tag || tag ==  kChunks[IDX_FRGM].tag);
+  const WebPData* const data = &frame_chunk->data_;
+  const size_t expected_data_size = ANMF_CHUNK_SIZE;
+  assert(frame_chunk->tag_ == kChunks[IDX_ANMF].tag);
+  assert(frame_chunk != NULL);
   if (data->size != expected_data_size) return WEBP_MUX_INVALID_ARGUMENT;
 
   *x_offset = 2 * GetLE24(data->bytes + 0);
   *y_offset = 2 * GetLE24(data->bytes + 3);
-  if (is_frame) *duration = GetLE24(data->bytes + 12);
+  *duration = GetLE24(data->bytes + 12);
   return WEBP_MUX_OK;
 }
 
@@ -424,13 +406,13 @@ static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
                                  int* const x_offset, int* const y_offset,
                                  int* const duration,
                                  int* const width, int* const height) {
-  const WebPChunk* const frame_frgm_chunk = wpi->header_;
+  const WebPChunk* const frame_chunk = wpi->header_;
   WebPMuxError err;
   assert(wpi != NULL);
-  assert(frame_frgm_chunk != NULL);
+  assert(frame_chunk != NULL);
 
-  // Get offsets and duration from ANMF/FRGM chunk.
-  err = GetFrameFragmentInfo(frame_frgm_chunk, x_offset, y_offset, duration);
+  // Get offsets and duration from ANMF chunk.
+  err = GetFrameInfo(frame_chunk, x_offset, y_offset, duration);
   if (err != WEBP_MUX_OK) return err;
 
   // Get width and height from VP8/VP8L chunk.
@@ -441,7 +423,6 @@ static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
 
 // Returns the tightest dimension for the canvas considering the image list.
 static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
-                                          uint32_t flags,
                                           int* const width, int* const height) {
   WebPMuxImage* wpi = NULL;
   assert(mux != NULL);
@@ -452,12 +433,10 @@ static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
   assert(wpi->img_ != NULL);
 
   if (wpi->next_ != NULL) {
-    int max_x = 0;
-    int max_y = 0;
-    int64_t image_area = 0;
+    int max_x = 0, max_y = 0;
     // if we have a chain of wpi's, header_ is necessarily set
     assert(wpi->header_ != NULL);
-    // Aggregate the bounding box for animation frames & fragmented images.
+    // Aggregate the bounding box for animation frames.
     for (; wpi != NULL; wpi = wpi->next_) {
       int x_offset = 0, y_offset = 0, duration = 0, w = 0, h = 0;
       const WebPMuxError err = GetImageInfo(wpi, &x_offset, &y_offset,
@@ -470,19 +449,9 @@ static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
 
       if (max_x_pos > max_x) max_x = max_x_pos;
       if (max_y_pos > max_y) max_y = max_y_pos;
-      image_area += w * h;
     }
     *width = max_x;
     *height = max_y;
-    // Crude check to validate that there are no image overlaps/holes for
-    // fragmented images. Check that the aggregated image area for individual
-    // fragments exactly matches the image area of the constructed canvas.
-    // However, the area-match is necessary but not sufficient condition.
-    if ((flags & FRAGMENTS_FLAG) && (image_area != (max_x * max_y))) {
-      *width = 0;
-      *height = 0;
-      return WEBP_MUX_INVALID_ARGUMENT;
-    }
   } else {
     // For a single image, canvas dimensions are same as image dimensions.
     *width = wpi->width_;
@@ -528,10 +497,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
     flags |= XMP_FLAG;
   }
   if (images->header_ != NULL) {
-    if (images->header_->tag_ == kChunks[IDX_FRGM].tag) {
-      // This is a fragmented image.
-      flags |= FRAGMENTS_FLAG;
-    } else if (images->header_->tag_ == kChunks[IDX_ANMF].tag) {
+    if (images->header_->tag_ == kChunks[IDX_ANMF].tag) {
       // This is an image with animation.
       flags |= ANIMATION_FLAG;
     }
@@ -540,7 +506,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
     flags |= ALPHA_FLAG;  // Some images have an alpha channel.
   }
 
-  err = GetAdjustedCanvasSize(mux, flags, &width, &height);
+  err = GetAdjustedCanvasSize(mux, &width, &height);
   if (err != WEBP_MUX_OK) return err;
 
   if (width <= 0 || height <= 0) {
@@ -580,31 +546,26 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
 // Cleans up 'mux' by removing any unnecessary chunks.
 static WebPMuxError MuxCleanup(WebPMux* const mux) {
   int num_frames;
-  int num_fragments;
   int num_anim_chunks;
 
-  // If we have an image with a single fragment or frame, and its rectangle
-  // covers the whole canvas, convert it to a non-animated non-fragmented image
-  // (to avoid writing FRGM/ANMF chunk unnecessarily).
+  // If we have an image with a single frame, and its rectangle
+  // covers the whole canvas, convert it to a non-animated image
+  // (to avoid writing ANMF chunk unnecessarily).
   WebPMuxError err = WebPMuxNumChunks(mux, kChunks[IDX_ANMF].id, &num_frames);
   if (err != WEBP_MUX_OK) return err;
-  err = WebPMuxNumChunks(mux, kChunks[IDX_FRGM].id, &num_fragments);
-  if (err != WEBP_MUX_OK) return err;
-  if (num_frames == 1 || num_fragments == 1) {
-    WebPMuxImage* frame_frag;
-    err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame_frag);
-    assert(err == WEBP_MUX_OK);  // We know that one frame/fragment does exist.
-    assert(frame_frag != NULL);
-    if (frame_frag->header_ != NULL &&
+  if (num_frames == 1) {
+    WebPMuxImage* frame = NULL;
+    err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame);
+    assert(err == WEBP_MUX_OK);  // We know that one frame does exist.
+    assert(frame != NULL);
+    if (frame->header_ != NULL &&
         ((mux->canvas_width_ == 0 && mux->canvas_height_ == 0) ||
-         (frame_frag->width_ == mux->canvas_width_ &&
-          frame_frag->height_ == mux->canvas_height_))) {
-      assert(frame_frag->header_->tag_ == kChunks[IDX_ANMF].tag ||
-             frame_frag->header_->tag_ == kChunks[IDX_FRGM].tag);
-      ChunkDelete(frame_frag->header_);  // Removes ANMF/FRGM chunk.
-      frame_frag->header_ = NULL;
+         (frame->width_ == mux->canvas_width_ &&
+          frame->height_ == mux->canvas_height_))) {
+      assert(frame->header_->tag_ == kChunks[IDX_ANMF].tag);
+      ChunkDelete(frame->header_);  // Removes ANMF chunk.
+      frame->header_ = NULL;
       num_frames = 0;
-      num_fragments = 0;
     }
   }
   // Remove ANIM chunk if this is a non-animated image.
diff --git a/src/3rdparty/libwebp/src/mux/muxi.h b/src/3rdparty/libwebp/src/mux/muxi.h
index d4d5cba..e6606aa 100644
--- a/src/3rdparty/libwebp/src/mux/muxi.h
+++ b/src/3rdparty/libwebp/src/mux/muxi.h
@@ -15,8 +15,8 @@
 #define WEBP_MUX_MUXI_H_
 
 #include <stdlib.h>
-#include "../dec/vp8i.h"
-#include "../dec/vp8li.h"
+#include "../dec/vp8i_dec.h"
+#include "../dec/vp8li_dec.h"
 #include "../webp/mux.h"
 
 #ifdef __cplusplus
@@ -27,8 +27,8 @@ extern "C" {
 // Defines and constants.
 
 #define MUX_MAJ_VERSION 0
-#define MUX_MIN_VERSION 3
-#define MUX_REV_VERSION 1
+#define MUX_MIN_VERSION 4
+#define MUX_REV_VERSION 0
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
@@ -36,16 +36,16 @@ struct WebPChunk {
   uint32_t        tag_;
   int             owner_;  // True if *data_ memory is owned internally.
                            // VP8X, ANIM, and other internally created chunks
-                           // like ANMF/FRGM are always owned.
+                           // like ANMF are always owned.
   WebPData        data_;
   WebPChunk*      next_;
 };
 
-// MuxImage object. Store a full WebP image (including ANMF/FRGM chunk, ALPH
+// MuxImage object. Store a full WebP image (including ANMF chunk, ALPH
 // chunk and VP8/VP8L chunk),
 typedef struct WebPMuxImage WebPMuxImage;
 struct WebPMuxImage {
-  WebPChunk*  header_;      // Corresponds to WEBP_CHUNK_ANMF/WEBP_CHUNK_FRGM.
+  WebPChunk*  header_;      // Corresponds to WEBP_CHUNK_ANMF.
   WebPChunk*  alpha_;       // Corresponds to WEBP_CHUNK_ALPHA.
   WebPChunk*  img_;         // Corresponds to WEBP_CHUNK_IMAGE.
   WebPChunk*  unknown_;     // Corresponds to WEBP_CHUNK_UNKNOWN.
@@ -79,7 +79,6 @@ typedef enum {
   IDX_ICCP,
   IDX_ANIM,
   IDX_ANMF,
-  IDX_FRGM,
   IDX_ALPHA,
   IDX_VP8,
   IDX_VP8L,
@@ -185,7 +184,6 @@ int MuxImageFinalize(WebPMuxImage* const wpi);
 static WEBP_INLINE int IsWPI(WebPChunkId id) {
   switch (id) {
     case WEBP_CHUNK_ANMF:
-    case WEBP_CHUNK_FRGM:
     case WEBP_CHUNK_ALPHA:
     case WEBP_CHUNK_IMAGE:  return 1;
     default:        return 0;
diff --git a/src/3rdparty/libwebp/src/mux/muxinternal.c b/src/3rdparty/libwebp/src/mux/muxinternal.c
index 4babbe8..387b57e 100644
--- a/src/3rdparty/libwebp/src/mux/muxinternal.c
+++ b/src/3rdparty/libwebp/src/mux/muxinternal.c
@@ -16,14 +16,13 @@
 #include "./muxi.h"
 #include "../utils/utils.h"
 
-#define UNDEFINED_CHUNK_SIZE (-1)
+#define UNDEFINED_CHUNK_SIZE ((uint32_t)(-1))
 
 const ChunkInfo kChunks[] = {
   { MKFOURCC('V', 'P', '8', 'X'),  WEBP_CHUNK_VP8X,    VP8X_CHUNK_SIZE },
   { MKFOURCC('I', 'C', 'C', 'P'),  WEBP_CHUNK_ICCP,    UNDEFINED_CHUNK_SIZE },
   { MKFOURCC('A', 'N', 'I', 'M'),  WEBP_CHUNK_ANIM,    ANIM_CHUNK_SIZE },
   { MKFOURCC('A', 'N', 'M', 'F'),  WEBP_CHUNK_ANMF,    ANMF_CHUNK_SIZE },
-  { MKFOURCC('F', 'R', 'G', 'M'),  WEBP_CHUNK_FRGM,    FRGM_CHUNK_SIZE },
   { MKFOURCC('A', 'L', 'P', 'H'),  WEBP_CHUNK_ALPHA,   UNDEFINED_CHUNK_SIZE },
   { MKFOURCC('V', 'P', '8', ' '),  WEBP_CHUNK_IMAGE,   UNDEFINED_CHUNK_SIZE },
   { MKFOURCC('V', 'P', '8', 'L'),  WEBP_CHUNK_IMAGE,   UNDEFINED_CHUNK_SIZE },
@@ -251,8 +250,7 @@ static WebPChunk** GetChunkListFromId(const WebPMuxImage* const wpi,
                                       WebPChunkId id) {
   assert(wpi != NULL);
   switch (id) {
-    case WEBP_CHUNK_ANMF:
-    case WEBP_CHUNK_FRGM:  return (WebPChunk**)&wpi->header_;
+    case WEBP_CHUNK_ANMF:  return (WebPChunk**)&wpi->header_;
     case WEBP_CHUNK_ALPHA: return (WebPChunk**)&wpi->alpha_;
     case WEBP_CHUNK_IMAGE: return (WebPChunk**)&wpi->img_;
     default: return NULL;
@@ -372,13 +370,12 @@ size_t MuxImageDiskSize(const WebPMuxImage* const wpi) {
   return size;
 }
 
-// Special case as ANMF/FRGM chunk encapsulates other image chunks.
+// Special case as ANMF chunk encapsulates other image chunks.
 static uint8_t* ChunkEmitSpecial(const WebPChunk* const header,
                                  size_t total_size, uint8_t* dst) {
   const size_t header_size = header->data_.size;
   const size_t offset_to_next = total_size - CHUNK_HEADER_SIZE;
-  assert(header->tag_ == kChunks[IDX_ANMF].tag ||
-         header->tag_ == kChunks[IDX_FRGM].tag);
+  assert(header->tag_ == kChunks[IDX_ANMF].tag);
   PutLE32(dst + 0, header->tag_);
   PutLE32(dst + TAG_SIZE, (uint32_t)offset_to_next);
   assert(header_size == (uint32_t)header_size);
@@ -391,7 +388,7 @@ static uint8_t* ChunkEmitSpecial(const WebPChunk* const header,
 
 uint8_t* MuxImageEmit(const WebPMuxImage* const wpi, uint8_t* dst) {
   // Ordering of chunks to be emitted is strictly as follows:
-  // 1. ANMF/FRGM chunk (if present).
+  // 1. ANMF chunk (if present).
   // 2. ALPH chunk (if present).
   // 3. VP8/VP8L chunk.
   assert(wpi);
@@ -439,7 +436,7 @@ static int IsNotCompatible(int feature, int num_items) {
   return (feature != 0) != (num_items > 0);
 }
 
-#define NO_FLAG 0
+#define NO_FLAG ((WebPFeatureFlags)0)
 
 // Test basic constraints:
 // retrieval, maximum number of chunks by index (use -1 to skip)
@@ -465,7 +462,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
   int num_xmp;
   int num_anim;
   int num_frames;
-  int num_fragments;
   int num_vp8x;
   int num_images;
   int num_alpha;
@@ -510,10 +506,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
     }
   }
 
-  // Fragmentation: FRAGMENTS_FLAG and FRGM chunk(s) are consistent.
-  err = ValidateChunk(mux, IDX_FRGM, FRAGMENTS_FLAG, flags, -1, &num_fragments);
-  if (err != WEBP_MUX_OK) return err;
-
   // Verify either VP8X chunk is present OR there is only one elem in
   // mux->images_.
   err = ValidateChunk(mux, IDX_VP8X, NO_FLAG, flags, 1, &num_vp8x);
@@ -537,11 +529,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
     if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
   }
 
-  // num_fragments & num_images are consistent.
-  if (num_fragments > 0 && num_images != num_fragments) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-
   return WEBP_MUX_OK;
 }
 
diff --git a/src/3rdparty/libwebp/src/mux/muxread.c b/src/3rdparty/libwebp/src/mux/muxread.c
index 8957a1e..410acd9 100644
--- a/src/3rdparty/libwebp/src/mux/muxread.c
+++ b/src/3rdparty/libwebp/src/mux/muxread.c
@@ -104,17 +104,15 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
   size_t subchunk_size;
   ChunkInit(&subchunk);
 
-  assert(chunk->tag_ == kChunks[IDX_ANMF].tag ||
-         chunk->tag_ == kChunks[IDX_FRGM].tag);
+  assert(chunk->tag_ == kChunks[IDX_ANMF].tag);
   assert(!wpi->is_partial_);
 
-  // ANMF/FRGM.
+  // ANMF.
   {
-    const size_t hdr_size = (chunk->tag_ == kChunks[IDX_ANMF].tag) ?
-        ANMF_CHUNK_SIZE : FRGM_CHUNK_SIZE;
+    const size_t hdr_size = ANMF_CHUNK_SIZE;
     const WebPData temp = { bytes, hdr_size };
-    // Each of ANMF and FRGM chunk contain a header at the beginning. So, its
-    // size should at least be 'hdr_size'.
+    // Each ANMF chunk contains a header at the beginning. So, its size should
+    // be at least 'hdr_size'.
     if (size < hdr_size) goto Fail;
     ChunkAssignData(&subchunk, &temp, copy_data, chunk->tag_);
   }
@@ -292,16 +290,15 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
 static WebPMuxError ValidateForSingleImage(const WebPMux* const mux) {
   const int num_images = MuxImageCount(mux->images_, WEBP_CHUNK_IMAGE);
   const int num_frames = MuxImageCount(mux->images_, WEBP_CHUNK_ANMF);
-  const int num_fragments = MuxImageCount(mux->images_, WEBP_CHUNK_FRGM);
 
   if (num_images == 0) {
     // No images in mux.
     return WEBP_MUX_NOT_FOUND;
-  } else if (num_images == 1 && num_frames == 0 && num_fragments == 0) {
+  } else if (num_images == 1 && num_frames == 0) {
     // Valid case (single image).
     return WEBP_MUX_OK;
   } else {
-    // Frame/Fragment case OR an invalid mux.
+    // Frame case OR an invalid mux.
     return WEBP_MUX_INVALID_ARGUMENT;
   }
 }
@@ -379,7 +376,7 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
   const int need_vp8x = (wpi->alpha_ != NULL);
   const size_t vp8x_size = need_vp8x ? CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE : 0;
   const size_t alpha_size = need_vp8x ? ChunkDiskSize(wpi->alpha_) : 0;
-  // Note: No need to output ANMF/FRGM chunk for a single image.
+  // Note: No need to output ANMF chunk for a single image.
   const size_t size = RIFF_HEADER_SIZE + vp8x_size + alpha_size +
                       ChunkDiskSize(wpi->img_);
   uint8_t* const data = (uint8_t*)WebPSafeMalloc(1ULL, size);
@@ -436,29 +433,24 @@ static WebPMuxError MuxGetImageInternal(const WebPMuxImage* const wpi,
   return SynthesizeBitstream(wpi, &info->bitstream);
 }
 
-static WebPMuxError MuxGetFrameFragmentInternal(const WebPMuxImage* const wpi,
-                                                WebPMuxFrameInfo* const frame) {
+static WebPMuxError MuxGetFrameInternal(const WebPMuxImage* const wpi,
+                                        WebPMuxFrameInfo* const frame) {
   const int is_frame = (wpi->header_->tag_ == kChunks[IDX_ANMF].tag);
-  const CHUNK_INDEX idx = is_frame ? IDX_ANMF : IDX_FRGM;
-  const WebPData* frame_frgm_data;
+  const WebPData* frame_data;
   if (!is_frame) return WEBP_MUX_INVALID_ARGUMENT;
   assert(wpi->header_ != NULL);  // Already checked by WebPMuxGetFrame().
-  // Get frame/fragment chunk.
-  frame_frgm_data = &wpi->header_->data_;
-  if (frame_frgm_data->size < kChunks[idx].size) return WEBP_MUX_BAD_DATA;
+  // Get frame chunk.
+  frame_data = &wpi->header_->data_;
+  if (frame_data->size < kChunks[IDX_ANMF].size) return WEBP_MUX_BAD_DATA;
   // Extract info.
-  frame->x_offset = 2 * GetLE24(frame_frgm_data->bytes + 0);
-  frame->y_offset = 2 * GetLE24(frame_frgm_data->bytes + 3);
-  if (is_frame) {
-    const uint8_t bits = frame_frgm_data->bytes[15];
-    frame->duration = GetLE24(frame_frgm_data->bytes + 12);
+  frame->x_offset = 2 * GetLE24(frame_data->bytes + 0);
+  frame->y_offset = 2 * GetLE24(frame_data->bytes + 3);
+  {
+    const uint8_t bits = frame_data->bytes[15];
+    frame->duration = GetLE24(frame_data->bytes + 12);
     frame->dispose_method =
         (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
     frame->blend_method = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
-  } else {  // Defaults for unused values.
-    frame->duration = 1;
-    frame->dispose_method = WEBP_MUX_DISPOSE_NONE;
-    frame->blend_method = WEBP_MUX_BLEND;
   }
   frame->id = ChunkGetIdFromTag(wpi->header_->tag_);
   return SynthesizeBitstream(wpi, &frame->bitstream);
@@ -482,7 +474,7 @@ WebPMuxError WebPMuxGetFrame(
   if (wpi->header_ == NULL) {
     return MuxGetImageInternal(wpi, frame);
   } else {
-    return MuxGetFrameFragmentInternal(wpi, frame);
+    return MuxGetFrameInternal(wpi, frame);
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
index 99ed313..fd7fb04 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -20,13 +20,12 @@
 #include "../webp/config.h"
 #endif
 
-#ifdef WEBP_FORCE_ALIGNED
-#include <string.h>  // memcpy
-#endif
+#include <string.h>  // for memcpy
 
 #include "../dsp/dsp.h"
-#include "./bit_reader.h"
-#include "./endian_inl.h"
+#include "./bit_reader_utils.h"
+#include "./endian_inl_utils.h"
+#include "./utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -62,10 +61,7 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
   if (br->buf_ < br->buf_max_) {
     // convert memory type to register type (with some zero'ing!)
     bit_t bits;
-#if defined(WEBP_FORCE_ALIGNED)
-    lbit_t in_bits;
-    memcpy(&in_bits, br->buf_, sizeof(in_bits));
-#elif defined(WEBP_USE_MIPS32)
+#if defined(WEBP_USE_MIPS32)
     // This is needed because of un-aligned read.
     lbit_t in_bits;
     lbit_t* p_buf_ = (lbit_t*)br->buf_;
@@ -80,7 +76,8 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
       : "memory", "at"
     );
 #else
-    const lbit_t in_bits = *(const lbit_t*)br->buf_;
+    lbit_t in_bits;
+    memcpy(&in_bits, br->buf_, sizeof(in_bits));
 #endif
     br->buf_ += BITS >> 3;
 #if !defined(WORDS_BIGENDIAN)
@@ -119,37 +116,26 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
     const int pos = br->bits_;
     const range_t split = (range * prob) >> 8;
     const range_t value = (range_t)(br->value_ >> pos);
-#if defined(__arm__) || defined(_M_ARM)      // ARM-specific
-    const int bit = ((int)(split - value) >> 31) & 1;
-    if (value > split) {
-      range -= split + 1;
-      br->value_ -= (bit_t)(split + 1) << pos;
-    } else {
-      range = split;
-    }
-#else  // faster version on x86
-    int bit;  // Don't use 'const int bit = (value > split);", it's slower.
-    if (value > split) {
-      range -= split + 1;
+    const int bit = (value > split);
+    if (bit) {
+      range -= split;
       br->value_ -= (bit_t)(split + 1) << pos;
-      bit = 1;
     } else {
-      range = split;
-      bit = 0;
+      range = split + 1;
     }
-#endif
-    if (range <= (range_t)0x7e) {
-      const int shift = kVP8Log2Range[range];
-      range = kVP8NewRange[range];
+    {
+      const int shift = 7 ^ BitsLog2Floor(range);
+      range <<= shift;
       br->bits_ -= shift;
     }
-    br->range_ = range;
+    br->range_ = range - 1;
     return bit;
   }
 }
 
 // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
-static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+int VP8GetSigned(VP8BitReader* const br, int v) {
   if (br->bits_ < 0) {
     VP8LoadNewBytes(br);
   }
@@ -166,6 +152,37 @@ static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
   }
 }
 
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
+  // Don't move this declaration! It makes a big speed difference to store
+  // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
+  // alter br->range_ value.
+  range_t range = br->range_;
+  if (br->bits_ < 0) {
+    VP8LoadNewBytes(br);
+  }
+  {
+    const int pos = br->bits_;
+    const range_t split = (range * prob) >> 8;
+    const range_t value = (range_t)(br->value_ >> pos);
+    int bit;  // Don't use 'const int bit = (value > split);', it's slower.
+    if (value > split) {
+      range -= split + 1;
+      br->value_ -= (bit_t)(split + 1) << pos;
+      bit = 1;
+    } else {
+      range = split;
+      bit = 0;
+    }
+    if (range <= (range_t)0x7e) {
+      const int shift = kVP8Log2Range[range];
+      range = kVP8NewRange[range];
+      br->bits_ -= shift;
+    }
+    br->range_ = range;
+    return bit;
+  }
+}
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.c b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c
index 50ffb74..c3157e8 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.c
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c
@@ -15,7 +15,7 @@
 #include "../webp/config.h"
 #endif
 
-#include "./bit_reader_inl.h"
+#include "./bit_reader_inl_utils.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.h b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h
index ec3426c..ec3426c 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.c b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
index 0644286..ab0c49d 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.c
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
@@ -16,8 +16,8 @@
 #include <string.h>   // for memcpy()
 #include <stdlib.h>
 
-#include "./bit_writer.h"
-#include "./endian_inl.h"
+#include "./bit_writer_utils.h"
+#include "./endian_inl_utils.h"
 #include "./utils.h"
 
 //------------------------------------------------------------------------------
@@ -143,13 +143,13 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
 void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
   uint32_t mask;
   assert(nb_bits > 0 && nb_bits < 32);
-  for (mask = 1u << (nb_bits - 1); mask; mask >>= 1)
+  for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
     VP8PutBitUniform(bw, value & mask);
+  }
 }
 
 void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
-  if (!VP8PutBitUniform(bw, value != 0))
-    return;
+  if (!VP8PutBitUniform(bw, value != 0)) return;
   if (value < 0) {
     VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
   } else {
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.h b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h
index ef360d1..9c02bbc 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.h
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h
@@ -54,7 +54,8 @@ int VP8BitWriterAppend(VP8BitWriter* const bw,
 
 // return approximate write position (in bits)
 static WEBP_INLINE uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
-  return (uint64_t)(bw->pos_ + bw->run_) * 8 + 8 + bw->nb_bits_;
+  const uint64_t nb_bits = 8 + bw->nb_bits_;   // bw->nb_bits_ is <= 0, note
+  return (bw->pos_ + bw->run_) * 8 + nb_bits;
 }
 
 // Returns a pointer to the internal buffer.
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.c b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
index c34b2e7..0172590 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.c
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
@@ -14,7 +14,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./color_cache.h"
+#include "./color_cache_utils.h"
 #include "./utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.h b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
index a9a9f64..c373e6b 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.h
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
@@ -28,7 +28,11 @@ typedef struct {
   int hash_bits_;
 } VP8LColorCache;
 
-static const uint32_t kHashMul = 0x1e35a7bd;
+static const uint64_t kHashMul = 0x1e35a7bdull;
+
+static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
+  return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
+}
 
 static WEBP_INLINE uint32_t VP8LColorCacheLookup(
     const VP8LColorCache* const cc, uint32_t key) {
@@ -44,19 +48,20 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,
 
 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+  const int key = HashPix(argb, cc->hash_shift_);
   cc->colors_[key] = argb;
 }
 
 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  return (kHashMul * argb) >> cc->hash_shift_;
+  return HashPix(argb, cc->hash_shift_);
 }
 
+// Return the key if cc contains argb, and -1 otherwise.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
-  return (cc->colors_[key] == argb);
+  const int key = HashPix(argb, cc->hash_shift_);
+  return (cc->colors_[key] == argb) ? key : -1;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/endian_inl.h b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
index e11260f..e11260f 100644
--- a/src/3rdparty/libwebp/src/utils/endian_inl.h
+++ b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/filters.c b/src/3rdparty/libwebp/src/utils/filters_utils.c
index 15543b1..49c1d18 100644
--- a/src/3rdparty/libwebp/src/utils/filters.c
+++ b/src/3rdparty/libwebp/src/utils/filters_utils.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#include "./filters.h"
+#include "./filters_utils.h"
 #include <stdlib.h>
 #include <string.h>
 
diff --git a/src/3rdparty/libwebp/src/utils/filters.h b/src/3rdparty/libwebp/src/utils/filters_utils.h
index 088b132..088b132 100644
--- a/src/3rdparty/libwebp/src/utils/filters.h
+++ b/src/3rdparty/libwebp/src/utils/filters_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.c b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
index 4e5ef6b..f950465 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -14,7 +14,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman_encode.h"
+#include "./huffman_encode_utils.h"
 #include "./utils.h"
 #include "../webp/format_constants.h"
 
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.h b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
index a157165..a157165 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/huffman.c b/src/3rdparty/libwebp/src/utils/huffman_utils.c
index 36e5502..008b5d7 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.c
@@ -14,7 +14,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman.h"
+#include "./huffman_utils.h"
 #include "./utils.h"
 #include "../webp/format_constants.h"
 
@@ -45,7 +45,7 @@ static WEBP_INLINE uint32_t GetNextKey(uint32_t key, int len) {
   while (key & step) {
     step >>= 1;
   }
-  return (key & (step - 1)) + step;
+  return step ? (key & (step - 1)) + step : key;
 }
 
 // Stores code in table[0], table[step], table[2*step], ..., table[end].
@@ -75,11 +75,13 @@ static WEBP_INLINE int NextTableBitSize(const int* const count,
   return len - root_bits;
 }
 
-int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
-                          const int code_lengths[], int code_lengths_size) {
+// sorted[code_lengths_size] is a pre-allocated array for sorting symbols
+// by code length.
+static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                             const int code_lengths[], int code_lengths_size,
+                             uint16_t sorted[]) {
   HuffmanCode* table = root_table;  // next available space in table
   int total_size = 1 << root_bits;  // total size root table + 2nd level table
-  int* sorted = NULL;               // symbols sorted by code length
   int len;                          // current code length
   int symbol;                       // symbol index in original or sorted table
   // number of codes of each length:
@@ -114,11 +116,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
     offset[len + 1] = offset[len] + count[len];
   }
 
-  sorted = (int*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
-  if (sorted == NULL) {
-    return 0;
-  }
-
   // Sort symbols by length, by symbol order within each length.
   for (symbol = 0; symbol < code_lengths_size; ++symbol) {
     const int symbol_code_length = code_lengths[symbol];
@@ -133,7 +130,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
     code.bits = 0;
     code.value = (uint16_t)sorted[0];
     ReplicateValue(table, 1, total_size, code);
-    WebPSafeFree(sorted);
     return total_size;
   }
 
@@ -153,7 +149,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       num_nodes += num_open;
       num_open -= count[len];
       if (num_open < 0) {
-        WebPSafeFree(sorted);
         return 0;
       }
       for (; count[len] > 0; --count[len]) {
@@ -172,7 +167,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       num_nodes += num_open;
       num_open -= count[len];
       if (num_open < 0) {
-        WebPSafeFree(sorted);
         return 0;
       }
       for (; count[len] > 0; --count[len]) {
@@ -195,11 +189,35 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 
     // Check if tree is full.
     if (num_nodes != 2 * offset[MAX_ALLOWED_CODE_LENGTH] - 1) {
-      WebPSafeFree(sorted);
       return 0;
     }
   }
 
-  WebPSafeFree(sorted);
+  return total_size;
+}
+
+// Maximum code_lengths_size is 2328 (reached for 11-bit color_cache_bits).
+// More commonly, the value is around 280.
+#define MAX_CODE_LENGTHS_SIZE \
+  ((1 << MAX_CACHE_BITS) + NUM_LITERAL_CODES + NUM_LENGTH_CODES)
+// Cut-off value for switching between heap and stack allocation.
+#define SORTED_SIZE_CUTOFF 512
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                          const int code_lengths[], int code_lengths_size) {
+  int total_size;
+  assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
+  if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+    // use local stack-allocated array.
+    uint16_t sorted[SORTED_SIZE_CUTOFF];
+    total_size = BuildHuffmanTable(root_table, root_bits,
+                                   code_lengths, code_lengths_size, sorted);
+  } else {   // rare case. Use heap allocation.
+    uint16_t* const sorted =
+        (uint16_t*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
+    if (sorted == NULL) return 0;
+    total_size = BuildHuffmanTable(root_table, root_bits,
+                                   code_lengths, code_lengths_size, sorted);
+    WebPSafeFree(sorted);
+  }
   return total_size;
 }
diff --git a/src/3rdparty/libwebp/src/utils/huffman.h b/src/3rdparty/libwebp/src/utils/huffman_utils.h
index c6dd6aa..c6dd6aa 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
index ee0a3fe..d4d23d3 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -14,7 +14,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./quant_levels_dec.h"
+#include "./quant_levels_dec_utils.h"
 
 #include <string.h>   // for memset
 
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h
index 59a1349..59a1349 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels.c b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c
index d7c8aab..73174e8 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c
@@ -14,7 +14,7 @@
 
 #include <assert.h>
 
-#include "./quant_levels.h"
+#include "./quant_levels_utils.h"
 
 #define NUM_SYMBOLS     256
 
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels.h b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h
index 1cb5a32..1cb5a32 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/random.c b/src/3rdparty/libwebp/src/utils/random_utils.c
index 24e96ad..9f1e415 100644
--- a/src/3rdparty/libwebp/src/utils/random.c
+++ b/src/3rdparty/libwebp/src/utils/random_utils.c
@@ -12,7 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <string.h>
-#include "./random.h"
+#include "./random_utils.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/src/3rdparty/libwebp/src/utils/random.h b/src/3rdparty/libwebp/src/utils/random_utils.h
index c392a61..c392a61 100644
--- a/src/3rdparty/libwebp/src/utils/random.h
+++ b/src/3rdparty/libwebp/src/utils/random_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.c b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
index 00c9300..0d1f80d 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.c
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
@@ -15,7 +15,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "../dsp/dsp.h"
-#include "./rescaler.h"
+#include "./rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -48,11 +48,15 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
   wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
   wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
   if (!wrk->y_expand) {
-    // this is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
+    // wrk->x_add >= 1.
     const uint64_t ratio =
         (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
     if (ratio != (uint32_t)ratio) {
-      // We can't represent the ratio with the current fixed-point precision.
+      // When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
+      // current fixed-point precision. This happens when dst_height ==
+      // wrk->y_add (which == src_height), and wrk->x_add == 1.
       // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
       wrk->fxy_scale = 0;
     } else {
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.h b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
index 98b01a7..98b01a7 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.h
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/thread.c b/src/3rdparty/libwebp/src/utils/thread_utils.c
index 93f7622..1729060 100644
--- a/src/3rdparty/libwebp/src/utils/thread.c
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.c
@@ -13,7 +13,7 @@
 
 #include <assert.h>
 #include <string.h>   // for memset()
-#include "./thread.h"
+#include "./thread_utils.h"
 #include "./utils.h"
 
 #ifdef WEBP_USE_THREAD
@@ -183,8 +183,7 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
 #else
   // note that there is a consumer available so the signal isn't dropped in
   // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
-    return 1;
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
   // now unlock the mutex so pthread_cond_signal may be issued
   pthread_mutex_unlock(mutex);
   ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
@@ -226,8 +225,7 @@ static THREADFN ThreadLoop(void* ptr) {
 }
 
 // main thread state control
-static void ChangeState(WebPWorker* const worker,
-                        WebPWorkerStatus new_status) {
+static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
   // No-op when attempting to change state on a thread that didn't come up.
   // Checking status_ without acquiring the lock first would result in a data
   // race.
diff --git a/src/3rdparty/libwebp/src/utils/thread.h b/src/3rdparty/libwebp/src/utils/thread_utils.h
index 8408311..8408311 100644
--- a/src/3rdparty/libwebp/src/utils/thread.h
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/utils.c b/src/3rdparty/libwebp/src/utils/utils.c
index 2602ca3..504d924 100644
--- a/src/3rdparty/libwebp/src/utils/utils.c
+++ b/src/3rdparty/libwebp/src/utils/utils.c
@@ -25,7 +25,7 @@
 //    http://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
-               --stacks=yes --alloc-fn=WebPSafeAlloc --alloc-fn=WebPSafeCalloc
+               --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
       ms_print massif.out
 */
 // In addition:
@@ -175,8 +175,12 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
   }
 #endif
 #if defined(MALLOC_LIMIT)
-  if (mem_limit > 0 && total_mem + total_size >= mem_limit) {
-    return 0;   // fake fail!
+  if (mem_limit > 0) {
+    const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
+    if (new_total_mem != (size_t)new_total_mem ||
+        new_total_mem > mem_limit) {
+      return 0;   // fake fail!
+    }
   }
 #endif
 
@@ -239,8 +243,7 @@ void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
 
 //------------------------------------------------------------------------------
 
-#define MAX_COLOR_COUNT         MAX_PALETTE_SIZE
-#define COLOR_HASH_SIZE         (MAX_COLOR_COUNT * 4)
+#define COLOR_HASH_SIZE         (MAX_PALETTE_SIZE * 4)
 #define COLOR_HASH_RIGHT_SHIFT  22  // 32 - log2(COLOR_HASH_SIZE).
 
 int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
@@ -249,7 +252,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
   int num_colors = 0;
   uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
   uint32_t colors[COLOR_HASH_SIZE];
-  static const uint32_t kHashMul = 0x1e35a7bdU;
+  static const uint64_t kHashMul = 0x1e35a7bdull;
   const uint32_t* argb = pic->argb;
   const int width = pic->width;
   const int height = pic->height;
@@ -264,14 +267,14 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
         continue;
       }
       last_pix = argb[x];
-      key = (kHashMul * last_pix) >> COLOR_HASH_RIGHT_SHIFT;
+      key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
       while (1) {
         if (!in_use[key]) {
           colors[key] = last_pix;
           in_use[key] = 1;
           ++num_colors;
-          if (num_colors > MAX_COLOR_COUNT) {
-            return MAX_COLOR_COUNT + 1;  // Exact count not needed.
+          if (num_colors > MAX_PALETTE_SIZE) {
+            return MAX_PALETTE_SIZE + 1;  // Exact count not needed.
           }
           break;
         } else if (colors[key] == last_pix) {
@@ -298,8 +301,30 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
   return num_colors;
 }
 
-#undef MAX_COLOR_COUNT
 #undef COLOR_HASH_SIZE
 #undef COLOR_HASH_RIGHT_SHIFT
 
 //------------------------------------------------------------------------------
+
+#if defined(WEBP_NEED_LOG_TABLE_8BIT)
+const uint8_t WebPLogTable8bit[256] = {   // 31 ^ clz(i)
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+#endif
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h
index e0a8112..3ab4590 100644
--- a/src/3rdparty/libwebp/src/utils/utils.h
+++ b/src/3rdparty/libwebp/src/utils/utils.h
@@ -20,6 +20,7 @@
 #endif
 
 #include <assert.h>
+#include <limits.h>
 
 #include "../dsp/dsp.h"
 #include "../webp/types.h"
@@ -32,7 +33,14 @@ extern "C" {
 // Memory allocation
 
 // This is the maximum memory amount that libwebp will ever try to allocate.
-#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 40)
+#ifndef WEBP_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 34)
+#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 34)
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define WEBP_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif  // WEBP_MAX_ALLOCABLE_MEMORY
 
 // size-checking safe malloc/calloc: verify that the requested size is not too
 // large, or return NULL. You don't need to call these for constructs like
@@ -54,7 +62,6 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
 #define WEBP_ALIGN_CST 31
 #define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
 
-#if defined(WEBP_FORCE_ALIGNED)
 #include <string.h>
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
 static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
@@ -65,16 +72,6 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
   memcpy(ptr, &val, sizeof(val));
 }
-#else
-static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-uint32_t WebPMemToUint32(const uint8_t* const ptr) {
-  return *(const uint32_t*)ptr;
-}
-static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
-  *(uint32_t*)ptr = val;
-}
-#endif
 
 //------------------------------------------------------------------------------
 // Reading/writing data.
@@ -110,6 +107,19 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
+// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
+// based on table or not. Can be used as fallback if clz() is not available.
+#define WEBP_NEED_LOG_TABLE_8BIT
+extern const uint8_t WebPLogTable8bit[256];
+static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
+  int log = 0;
+  while (n >= 256) {
+    log += 8;
+    n >>= 8;
+  }
+  return log + WebPLogTable8bit[n];
+}
+
 // Returns (int)floor(log2(n)). n must be > 0.
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
@@ -127,22 +137,8 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#else
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
+#else   // default: use the C-version.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
 #endif
 
 //------------------------------------------------------------------------------
@@ -164,12 +160,12 @@ WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
 // Unique colors.
 
 // Returns count of unique colors in 'pic', assuming pic->use_argb is true.
-// If the unique color count is more than MAX_COLOR_COUNT, returns
-// MAX_COLOR_COUNT+1.
+// If the unique color count is more than MAX_PALETTE_SIZE, returns
+// MAX_PALETTE_SIZE+1.
 // If 'palette' is not NULL and number of unique colors is less than or equal to
-// MAX_COLOR_COUNT, also outputs the actual unique colors into 'palette'.
+// MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'.
 // Note: 'palette' is assumed to be an array already allocated with at least
-// MAX_COLOR_COUNT elements.
+// MAX_PALETTE_SIZE elements.
 WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic,
                                      uint32_t* const palette);
 
diff --git a/src/3rdparty/libwebp/src/webp/config.h b/src/3rdparty/libwebp/src/webp/config.h
index 118ac38..731115b 100644
--- a/src/3rdparty/libwebp/src/webp/config.h
+++ b/src/3rdparty/libwebp/src/webp/config.h
@@ -79,7 +79,7 @@
 #define PACKAGE_NAME "libwebp"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 0.5.1"
+#define PACKAGE_STRING "libwebp 0.6.0"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "libwebp"
@@ -88,7 +88,7 @@
 #define PACKAGE_URL "http://developers.google.com/speed/webp"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "0.5.1"
+#define PACKAGE_VERSION "0.6.0"
 
 /* Define to necessary symbol if this constant uses a non-standard name on
    your system. */
@@ -98,7 +98,7 @@
 /* #undef STDC_HEADERS */
 
 /* Version number of package */
-#define VERSION "0.5.1"
+#define VERSION "0.6.0"
 
 /* Enable experimental code */
 /* #undef WEBP_EXPERIMENTAL_FEATURES */
diff --git a/src/3rdparty/libwebp/src/webp/decode.h b/src/3rdparty/libwebp/src/webp/decode.h
index 7a3bed9..4c5e74a 100644
--- a/src/3rdparty/libwebp/src/webp/decode.h
+++ b/src/3rdparty/libwebp/src/webp/decode.h
@@ -248,19 +248,19 @@ typedef enum VP8StatusCode {
 // picture is only partially decoded, pending additional input.
 // Code example:
 //
-//   WebPInitDecBuffer(&buffer);
-//   buffer.colorspace = mode;
+//   WebPInitDecBuffer(&output_buffer);
+//   output_buffer.colorspace = mode;
 //   ...
-//   WebPIDecoder* idec = WebPINewDecoder(&buffer);
-//   while (has_more_data) {
-//     // ... (get additional data)
+//   WebPIDecoder* idec = WebPINewDecoder(&output_buffer);
+//   while (additional_data_is_available) {
+//     // ... (get additional data in some new_data[] buffer)
 //     status = WebPIAppend(idec, new_data, new_data_size);
-//     if (status != VP8_STATUS_SUSPENDED ||
-//       break;
+//     if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) {
+//       break;    // an error occurred.
 //     }
 //
 //     // The above call decodes the current available buffer.
-//     // Part of the image can now be refreshed by calling to
+//     // Part of the image can now be refreshed by calling
 //     // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
 //   }
 //   WebPIDelete(idec);
diff --git a/src/3rdparty/libwebp/src/webp/encode.h b/src/3rdparty/libwebp/src/webp/encode.h
index 9291b71..35fde1d 100644
--- a/src/3rdparty/libwebp/src/webp/encode.h
+++ b/src/3rdparty/libwebp/src/webp/encode.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x0209    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x020e    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -141,12 +141,10 @@ struct WebPConfig {
                           // RGB information for better compression. The default
                           // value is 0.
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  int delta_palettization;
+  int use_delta_palette;  // reserved for future lossless feature
+  int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
+
   uint32_t pad[2];        // padding for later use
-#else
-  uint32_t pad[3];        // padding for later use
-#endif  // WEBP_EXPERIMENTAL_FEATURES
 };
 
 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -388,9 +386,24 @@ WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture);
 // Returns false in case of memory allocation error.
 WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 
+// Compute the single distortion for packed planes of samples.
+// 'src' will be compared to 'ref', and the raw distortion stored into
+// '*distortion'. The refined metric (log(MSE), log(1 - ssim), ...) will be
+// stored in '*result'.
+// 'x_step' is the horizontal stride (in bytes) between samples.
+// 'src/ref_stride' is the byte distance between rows.
+// Returns false in case of error (bad parameter, memory allocation error, ...).
+WEBP_EXTERN(int) WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                                     const uint8_t* ref, size_t ref_stride,
+                                     int width, int height,
+                                     size_t x_step,
+                                     int type,   // 0 = PSNR, 1 = SSIM, 2 = LSIM
+                                     float* distortion, float* result);
+
 // Compute PSNR, SSIM or LSIM distortion metric between two pictures. Results
-// are in dB, stored in result[] in the Y/U/V/Alpha/All or B/G/R/A/All order.
-// Returns false in case of error (src and ref don't have same dimension, ...)
+// are in dB, stored in result[] in the B/G/R/A/All order. The distortion is
+// always performed using ARGB samples. Hence if the input is YUV(A), the
+// picture will be internally converted to ARGB (just for the measurement).
 // Warning: this function is rather CPU-intensive.
 WEBP_EXTERN(int) WebPPictureDistortion(
     const WebPPicture* src, const WebPPicture* ref,
@@ -473,18 +486,20 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
 WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
     WebPPicture* picture, WebPEncCSP colorspace, float dithering);
 
-// Performs 'smart' RGBA->YUVA420 downsampling and colorspace conversion.
+// Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion.
 // Downsampling is handled with extra care in case of color clipping. This
 // method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
-// YUV representation.
+// and sharper YUV representation.
 // Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureSharpARGBToYUVA(WebPPicture* picture);
+// kept for backward compatibility:
 WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
-// The input format must be YUV_420 or YUV_420A.
-// Note that the use of this method is discouraged if one has access to the
-// raw ARGB samples, since using YUV420 is comparatively lossy. Also, the
-// conversion from YUV420 to ARGB incurs a small loss too.
+// The input format must be YUV_420 or YUV_420A. The conversion from YUV420 to
+// ARGB incurs a small loss too.
+// Note that the use of this colorspace is discouraged if one has access to the
+// raw ARGB samples, since using YUV420 is comparatively lossy.
 // Returns false in case of error.
 WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture);
 
diff --git a/src/3rdparty/libwebp/src/webp/extras.h b/src/3rdparty/libwebp/src/webp/extras.h
deleted file mode 100644
index 1c24be2..0000000
--- a/src/3rdparty/libwebp/src/webp/extras.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-
-#ifndef WEBP_WEBP_EXTRAS_H_
-#define WEBP_WEBP_EXTRAS_H_
-
-#include "./types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "./encode.h"
-
-#define WEBP_EXTRAS_ABI_VERSION 0x0000    // MAJOR(8b) + MINOR(8b)
-
-//------------------------------------------------------------------------------
-
-// Returns the version number of the extras library, packed in hexadecimal using
-// 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetExtrasVersion(void);
-
-//------------------------------------------------------------------------------
-// Ad-hoc colorspace importers.
-
-// Import luma sample (gray scale image) into 'picture'. The 'picture'
-// width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportGray(const uint8_t* gray, WebPPicture* picture);
-
-// Import rgb sample in RGB565 packed format into 'picture'. The 'picture'
-// width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);
-
-// Import rgb sample in RGB4444 packed format into 'picture'. The 'picture'
-// width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_WEBP_EXTRAS_H_ */
diff --git a/src/3rdparty/libwebp/src/webp/format_constants.h b/src/3rdparty/libwebp/src/webp/format_constants.h
index b6e78a6..329fc8a 100644
--- a/src/3rdparty/libwebp/src/webp/format_constants.h
+++ b/src/3rdparty/libwebp/src/webp/format_constants.h
@@ -72,14 +72,13 @@ typedef enum {
 #define RIFF_HEADER_SIZE   12    // Size of the RIFF header ("RIFFnnnnWEBP").
 #define ANMF_CHUNK_SIZE    16    // Size of an ANMF chunk.
 #define ANIM_CHUNK_SIZE    6     // Size of an ANIM chunk.
-#define FRGM_CHUNK_SIZE    6     // Size of a FRGM chunk.
 #define VP8X_CHUNK_SIZE    10    // Size of a VP8X chunk.
 
 #define MAX_CANVAS_SIZE     (1 << 24)     // 24-bit max for VP8X width/height.
 #define MAX_IMAGE_AREA      (1ULL << 32)  // 32-bit max for width x height.
 #define MAX_LOOP_COUNT      (1 << 16)     // maximum value for loop-count
 #define MAX_DURATION        (1 << 24)     // maximum duration
-#define MAX_POSITION_OFFSET (1 << 24)     // maximum frame/fragment x/y offset
+#define MAX_POSITION_OFFSET (1 << 24)     // maximum frame x/y offset
 
 // Maximum chunk payload is such that adding the header and padding won't
 // overflow a uint32_t.
diff --git a/src/3rdparty/libwebp/src/webp/mux.h b/src/3rdparty/libwebp/src/webp/mux.h
index b72658c..daccc65 100644
--- a/src/3rdparty/libwebp/src/webp/mux.h
+++ b/src/3rdparty/libwebp/src/webp/mux.h
@@ -21,13 +21,13 @@
 extern "C" {
 #endif
 
-#define WEBP_MUX_ABI_VERSION 0x0106        // MAJOR(8b) + MINOR(8b)
+#define WEBP_MUX_ABI_VERSION 0x0108        // MAJOR(8b) + MINOR(8b)
 
 //------------------------------------------------------------------------------
 // Mux API
 //
 // This API allows manipulation of WebP container images containing features
-// like color profile, metadata, animation and fragmented images.
+// like color profile, metadata and animation.
 //
 // Code Example#1: Create a WebPMux object with image data, color profile and
 // XMP metadata.
@@ -81,16 +81,16 @@ typedef enum WebPMuxError {
 
 // IDs for different types of chunks.
 typedef enum WebPChunkId {
-  WEBP_CHUNK_VP8X,     // VP8X
-  WEBP_CHUNK_ICCP,     // ICCP
-  WEBP_CHUNK_ANIM,     // ANIM
-  WEBP_CHUNK_ANMF,     // ANMF
-  WEBP_CHUNK_FRGM,     // FRGM
-  WEBP_CHUNK_ALPHA,    // ALPH
-  WEBP_CHUNK_IMAGE,    // VP8/VP8L
-  WEBP_CHUNK_EXIF,     // EXIF
-  WEBP_CHUNK_XMP,      // XMP
-  WEBP_CHUNK_UNKNOWN,  // Other chunks.
+  WEBP_CHUNK_VP8X,        // VP8X
+  WEBP_CHUNK_ICCP,        // ICCP
+  WEBP_CHUNK_ANIM,        // ANIM
+  WEBP_CHUNK_ANMF,        // ANMF
+  WEBP_CHUNK_DEPRECATED,  // (formerly FRGM, now deprecated)
+  WEBP_CHUNK_ALPHA,       // ALPH
+  WEBP_CHUNK_IMAGE,       // VP8/VP8L
+  WEBP_CHUNK_EXIF,        // EXIF
+  WEBP_CHUNK_XMP,         // XMP
+  WEBP_CHUNK_UNKNOWN,     // Other chunks.
   WEBP_CHUNK_NIL
 } WebPChunkId;
 
@@ -142,7 +142,7 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 // Non-image chunks.
 
 // Note: Only non-image related chunks should be managed through chunk APIs.
-// (Image related chunks are: "ANMF", "FRGM", "VP8 ", "VP8L" and "ALPH").
+// (Image related chunks are: "ANMF", "VP8 ", "VP8L" and "ALPH").
 // To add, get and delete images, use WebPMuxSetImage(), WebPMuxPushFrame(),
 // WebPMuxGetFrame() and WebPMuxDeleteFrame().
 
@@ -195,7 +195,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk(
 //------------------------------------------------------------------------------
 // Images.
 
-// Encapsulates data about a single frame/fragment.
+// Encapsulates data about a single frame.
 struct WebPMuxFrameInfo {
   WebPData    bitstream;  // image data: can be a raw VP8/VP8L bitstream
                           // or a single-image WebP file.
@@ -203,19 +203,19 @@ struct WebPMuxFrameInfo {
   int         y_offset;   // y-offset of the frame.
   int         duration;   // duration of the frame (in milliseconds).
 
-  WebPChunkId id;         // frame type: should be one of WEBP_CHUNK_ANMF,
-                          // WEBP_CHUNK_FRGM or WEBP_CHUNK_IMAGE
+  WebPChunkId id;         // frame type: should be one of WEBP_CHUNK_ANMF
+                          // or WEBP_CHUNK_IMAGE
   WebPMuxAnimDispose dispose_method;  // Disposal method for the frame.
   WebPMuxAnimBlend   blend_method;    // Blend operation for the frame.
   uint32_t    pad[1];     // padding for later use
 };
 
-// Sets the (non-animated and non-fragmented) image in the mux object.
-// Note: Any existing images (including frames/fragments) will be removed.
+// Sets the (non-animated) image in the mux object.
+// Note: Any existing images (including frames) will be removed.
 // Parameters:
 //   mux - (in/out) object in which the image is to be set
 //   bitstream - (in) can be a raw VP8/VP8L bitstream or a single-image
-//               WebP file (non-animated and non-fragmented)
+//               WebP file (non-animated)
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
 //               object and value 0 indicates data will NOT be copied.
 // Returns:
@@ -226,9 +226,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
     WebPMux* mux, const WebPData* bitstream, int copy_data);
 
 // Adds a frame at the end of the mux object.
-// Notes: (1) frame.id should be one of WEBP_CHUNK_ANMF or WEBP_CHUNK_FRGM
-//        (2) For setting a non-animated non-fragmented image, use
-//            WebPMuxSetImage() instead.
+// Notes: (1) frame.id should be WEBP_CHUNK_ANMF
+//        (2) For setting a non-animated image, use WebPMuxSetImage() instead.
 //        (3) Type of frame being pushed must be same as the frames in mux.
 //        (4) As WebP only supports even offsets, any odd offset will be snapped
 //            to an even location using: offset &= ~1
@@ -431,9 +430,10 @@ struct WebPAnimEncoderOptions {
                         // frames in the output. The library may insert some key
                         // frames as needed to satisfy this criteria.
                         // Note that these conditions should hold: kmax > kmin
-                        // and kmin >= kmax / 2 + 1. Also, if kmin == 0, then
-                        // key-frame insertion is disabled; and if kmax == 0,
-                        // then all frames will be key-frames.
+                        // and kmin >= kmax / 2 + 1. Also, if kmax <= 0, then
+                        // key-frame insertion is disabled; and if kmax == 1,
+                        // then all frames will be key-frames (kmin value does
+                        // not matter for these special cases).
   int allow_mixed;      // If true, use mixed compression mode; may choose
                         // either lossy and lossless for each frame.
   int verbose;          // If true, print info and warning messages to stderr.
diff --git a/src/3rdparty/libwebp/src/webp/mux_types.h b/src/3rdparty/libwebp/src/webp/mux_types.h
index c94043a..b37e2c6 100644
--- a/src/3rdparty/libwebp/src/webp/mux_types.h
+++ b/src/3rdparty/libwebp/src/webp/mux_types.h
@@ -31,12 +31,13 @@ typedef struct WebPData WebPData;
 
 // VP8X Feature Flags.
 typedef enum WebPFeatureFlags {
-  FRAGMENTS_FLAG  = 0x00000001,
   ANIMATION_FLAG  = 0x00000002,
   XMP_FLAG        = 0x00000004,
   EXIF_FLAG       = 0x00000008,
   ALPHA_FLAG      = 0x00000010,
-  ICCP_FLAG       = 0x00000020
+  ICCP_FLAG       = 0x00000020,
+
+  ALL_VALID_FLAGS = 0x0000003e
 } WebPFeatureFlags;
 
 // Dispose method (animation only). Indicates how the area used by the current
author	Liang Qi <liang.qi@qt.io>	2017-03-07 13:05:21 +0100
committer	Liang Qi <liang.qi@qt.io>	2017-03-13 10:47:45 +0000
commit	b7ec9e78633d8f2c75a8b02e17e169497bb103e2 (patch)
tree	e4be04af4dbcf8cd635715efdf4e769281183746
parent	f2dbc67c2b032a5f27d0224e020fb6dfcd3fd142 (diff)