author    Liang Qi <liang.qi@theqtcompany.com>  2015-04-08 10:10:02 +0200
committer Liang Qi <liang.qi@theqtcompany.com>  2015-04-09 08:41:30 +0000
commit    600a2d18d40ba87a273e1a258ce3189b0edc11f8 (patch)
tree      d80ede3880cb86f06eae50191e5dd3b102482189 /src/3rdparty
parent    32e437b1244e7bf45011e4fd770fc0eef40d02de (diff)
libwebp: update to 0.4.3
This commit imports libwebp 0.4.3, including the AUTHORS, COPYING, ChangeLog, NEWS, PATENTS, README and src directories. In src, only header and source files are included. The patches required to build it in Qt will follow in separate commit(s).

Change-Id: I23ebfd69e47a468c91a9e9b109e9cb8ac63705d4
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Reviewed-by: Konstantin Ritt <ritt.ks@gmail.com>
Reviewed-by: aavit <eirik.aavitsland@theqtcompany.com>
Diffstat (limited to 'src/3rdparty')
-rw-r--r--  src/3rdparty/libwebp/AUTHORS | 7
-rw-r--r--  src/3rdparty/libwebp/ChangeLog | 485
-rw-r--r--  src/3rdparty/libwebp/NEWS | 26
-rw-r--r--  src/3rdparty/libwebp/PATENTS | 39
-rw-r--r--  src/3rdparty/libwebp/README | 132
-rw-r--r--  src/3rdparty/libwebp/src/dec/alpha.c | 22
-rw-r--r--  src/3rdparty/libwebp/src/dec/buffer.c | 69
-rw-r--r--  src/3rdparty/libwebp/src/dec/frame.c | 22
-rw-r--r--  src/3rdparty/libwebp/src/dec/idec.c | 123
-rw-r--r--  src/3rdparty/libwebp/src/dec/io.c | 100
-rw-r--r--  src/3rdparty/libwebp/src/dec/tree.c | 30
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8.c | 76
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8i.h | 19
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8l.c | 132
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8li.h | 5
-rw-r--r--  src/3rdparty/libwebp/src/dec/webp.c | 36
-rw-r--r--  src/3rdparty/libwebp/src/dec/webpi.h | 8
-rw-r--r--  src/3rdparty/libwebp/src/demux/demux.c | 26
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing.c (renamed from src/3rdparty/libwebp/src/utils/alpha_processing.c) | 171
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c | 77
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cpu.c | 62
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec.c | 138
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_clip_tables.c | 366
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_mips32.c | 578
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_neon.c | 1157
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_sse2.c | 526
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dsp.h | 120
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc.c | 96
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_avx2.c (renamed from src/3rdparty/libwebp/src/dec/layer.c) | 24
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_mips32.c | 776
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_neon.c | 713
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_sse2.c | 175
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless.c | 981
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless.h | 135
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_mips32.c | 416
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_neon.c | 357
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_sse2.c | 535
-rw-r--r--  src/3rdparty/libwebp/src/dsp/neon.h | 82
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling.c | 162
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_neon.c | 56
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_sse2.c | 30
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv.c | 205
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv.h | 8
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_mips32.c | 100
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_sse2.c | 322
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h | 536
-rw-r--r--  src/3rdparty/libwebp/src/enc/alpha.c | 101
-rw-r--r--  src/3rdparty/libwebp/src/enc/analysis.c | 53
-rw-r--r--  src/3rdparty/libwebp/src/enc/backward_references.c | 497
-rw-r--r--  src/3rdparty/libwebp/src/enc/backward_references.h | 94
-rw-r--r--  src/3rdparty/libwebp/src/enc/config.c | 26
-rw-r--r--  src/3rdparty/libwebp/src/enc/cost.c | 250
-rw-r--r--  src/3rdparty/libwebp/src/enc/cost.h | 32
-rw-r--r--  src/3rdparty/libwebp/src/enc/filter.c | 189
-rw-r--r--  src/3rdparty/libwebp/src/enc/frame.c | 304
-rw-r--r--  src/3rdparty/libwebp/src/enc/histogram.c | 721
-rw-r--r--  src/3rdparty/libwebp/src/enc/histogram.h | 37
-rw-r--r--  src/3rdparty/libwebp/src/enc/layer.c | 44
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture.c | 1253
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_csp.c | 1114
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_psnr.c | 150
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_rescale.c | 285
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_tools.c | 206
-rw-r--r--  src/3rdparty/libwebp/src/enc/quant.c | 252
-rw-r--r--  src/3rdparty/libwebp/src/enc/syntax.c | 60
-rw-r--r--  src/3rdparty/libwebp/src/enc/token.c | 53
-rw-r--r--  src/3rdparty/libwebp/src/enc/vp8enci.h | 64
-rw-r--r--  src/3rdparty/libwebp/src/enc/vp8l.c | 423
-rw-r--r--  src/3rdparty/libwebp/src/enc/vp8li.h | 7
-rw-r--r--  src/3rdparty/libwebp/src/enc/webpenc.c | 106
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxedit.c | 104
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxi.h | 6
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxinternal.c | 8
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxread.c | 31
-rw-r--r--  src/3rdparty/libwebp/src/utils/alpha_processing.h | 46
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_reader.c | 159
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_reader.h | 267
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_reader_inl.h | 172
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_writer.c | 135
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_writer.h | 55
-rw-r--r--  src/3rdparty/libwebp/src/utils/color_cache.c | 2
-rw-r--r--  src/3rdparty/libwebp/src/utils/endian_inl.h | 100
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman.c | 95
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman.h | 39
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman_encode.c | 77
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman_encode.h | 16
-rw-r--r--  src/3rdparty/libwebp/src/utils/quant_levels_dec.c | 267
-rw-r--r--  src/3rdparty/libwebp/src/utils/quant_levels_dec.h | 11
-rw-r--r--  src/3rdparty/libwebp/src/utils/random.h | 3
-rw-r--r--  src/3rdparty/libwebp/src/utils/rescaler.c | 257
-rw-r--r--  src/3rdparty/libwebp/src/utils/rescaler.h | 16
-rw-r--r--  src/3rdparty/libwebp/src/utils/thread.c | 140
-rw-r--r--  src/3rdparty/libwebp/src/utils/thread.h | 90
-rw-r--r--  src/3rdparty/libwebp/src/utils/utils.c | 175
-rw-r--r--  src/3rdparty/libwebp/src/utils/utils.h | 42
-rw-r--r--  src/3rdparty/libwebp/src/webp/decode.h | 12
-rw-r--r--  src/3rdparty/libwebp/src/webp/encode.h | 68
-rw-r--r--  src/3rdparty/libwebp/src/webp/mux.h | 24
-rw-r--r--  src/3rdparty/libwebp/src/webp/types.h | 7
99 files changed, 13662 insertions, 5344 deletions
diff --git a/src/3rdparty/libwebp/AUTHORS b/src/3rdparty/libwebp/AUTHORS
index 817950f..5767e90 100644
--- a/src/3rdparty/libwebp/AUTHORS
+++ b/src/3rdparty/libwebp/AUTHORS
@@ -1,18 +1,25 @@
Contributors:
- Charles Munger (clm at google dot com)
- Christian Duvivier (cduvivier at google dot com)
+- Djordje Pesut (djordje dot pesut at imgtec dot com)
- James Zern (jzern at google dot com)
- Jan Engelhardt (jengelh at medozas dot de)
- Johann (johann dot koenig at duck dot com)
+- Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
- Jyrki Alakuijala (jyrki at google dot com)
+- levytamar82 (tamar dot levy at intel dot com)
- Lou Quillio (louquillio at google dot com)
- Mans Rullgard (mans at mansr dot com)
- Martin Olsson (mnemo at minimum dot se)
- Mikołaj Zalewski (mikolajz at google dot com)
- Noel Chromium (noel at chromium dot org)
- Pascal Massimino (pascal dot massimino at gmail dot com)
+- Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
- Pierre Joye (pierre dot php at gmail dot com)
- Scott LaVarnway (slavarnway at google dot com)
+- Scott Talbot (s at chikachow dot org)
+- Slobodan Prijic (slobodan dot prijic at imgtec dot com)
- Somnath Banerjee (somnath dot banerjee at gmail dot com)
+- Timothy Gu (timothygu99 at gmail dot com)
- Urvang Joshi (urvang at google dot com)
- Vikas Arora (vikasa at google dot com)
diff --git a/src/3rdparty/libwebp/ChangeLog b/src/3rdparty/libwebp/ChangeLog
index 5fa6c3f..2914cf5 100644
--- a/src/3rdparty/libwebp/ChangeLog
+++ b/src/3rdparty/libwebp/ChangeLog
@@ -1,3 +1,488 @@
+a661e50 Disable NEON code on Native Client
+fcd94e9 update ChangeLog (tag: v0.4.3-rc1)
+569fe57 update NEWS
+bd852f5 bump version to 0.4.3
+2d58b64 WebPPictureRescale: add a note about 0 width/height
+a0d8ca5 examples/Android.mk: add webpmux_example target
+34b1d29 Android.mk: add webpmux target
+7561988 Android.mk: add webpdemux target
+a987576 Android.mk: add webpdecoder{,_static} targets
+a6d4859 Android.mk: split source lists per-directory
+77544d5 fix iOS arm64 build with Xcode 6.3
+6dea157 doc/webp-container-spec: note MSB order for chunk diagrams
+f7cd57b doc/webp-container-spec: cosmetics
+1d6b250 vwebp: clear canvas at the beginning of each loop
+f97b3f8 webp-container-spec: clarify background clear on loop
+4ba83c1 vwebp: remove unnecessary static Help() prototype
+d34e8e3 vwebp/animation: display last frame on end-of-loop
+bbbc524 dec/vp8: clear 'dither_' on skipped blocks
+0339fa2 lossless_neon: enable subtract green for aarch64
+5a0c220 Regression fix for lossless decoding
+6e3a31d wicdec: (msvs) quiet some /analyze warnings
+b49a578 dwebp/WritePNG: mark png variables volatile
+0a4391a dwebp: include setjmp.h w/WEBP_HAVE_PNG
+90f1ec5 dwebp: correct sign in format strings
+b61ce86 VP8LEncodeStream: add an assert
+df1081b dsp/cpu: (msvs) add include for __cpuidex
+39aa055 dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
+f814f42 dsp/cpu: add include for _xgetbv() w/MSVS
+8508ab9 cpu: fix AVX2 detection for gcc/clang targets
+5769623 fix handling of zero-sized partition #0 corner case
+b2e71a9 make the 'last_cpuinfo_used' variable names unique
+1273e84 add -Wformat-nonliteral and -Wformat-security
+3ae78eb multi-thread fix: lock each entry points with a static var
+5c1eeda webp-container-spec: remove references to fragments
+c5ceea4 enc_neon: fix building with non-Xcode clang (iOS)
+d0859d6 iosbuild: add x64_64 simulator support
+046732c WebPEncode: Support encoding same pic twice (even if modified)
+4426f50 webp/types.h: use inline for clang++/-std=c++11
+e297fc7 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
+855fe43 Makefile.vc: add a 'legacy' RTLIBCFG option
+b7eb6d5 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
+5691bdd gif2webp: Handle frames with odd offsets + disposal to background.
+8301da1 stopwatch.h: fix includes
+6a2209a update ChangeLog (tag: v0.4.2, origin/0.4.2, 0.4.2)
+36cad6a bit_reader.h: cosmetics: fix a typo
+e2ecae6 enc_mips32: workaround gcc-4.9 bug
+243e68d update ChangeLog (tag: v0.4.2-rc2)
+eec5f5f enc/vp8enci.h: update version number
+0c1b98d update NEWS
+69b0fc9 update AUTHORS
+857578a bump version to 0.4.2
+9129deb restore encode API compatibility
+f17b95e AssignSegments: quiet -Warray-bounds warning
+9c56c8a enc_neon: initialize vectors w/vdup_n_u32
+a008902 iosbuild: cleanup
+cc6de53 iosbuild: output autoconf req. on failure
+740d765 iosbuild: make iOS 6 the minimum requirement
+403023f iobuild.sh: only install .h files in Headers
+b65727b Premultiply with alpha during U/V downsampling
+8de0deb gif2webp: Background color correction
+f8b7d94 Amend the lossless spec according to issue #205, #206 and #224
+9102a7b Add a WebPExtractAlpha function to dsp
+e407b5d webpmux: simplify InitializeConfig()
+3e70e64 webpmux: fix indent
+be38f1a webpmux: fix exit status on numeric value parse error
+94dadcb webpmux: fix loop_count range check
+40b3a61 examples: warn on invalid numeric parameters
+b7d209a gif2webp: Handle frames with missing graphic control extension
+bf0eb74 configure: simplify libpng-config invocation
+3740f7d Rectify bug in lossless incremental decoding.
+3ab0a37 make VP8LSetBitPos() set br->eos_ flag
+2e4312b Lossless decoding: fix eos_ flag condition
+e6609ac fix erroneous dec->status_ setting
+5692eae add a fallback to ALPHA_NO_COMPRESSION
+6ecd5bf ExUtilReadFromStdin: (windows) open stdin in bin mode
+4206ac6 webpmux: (windows) open stdout in binary mode
+d40e885 cwebp: (windows) open stdout in binary mode
+4aaf463 example_util: add ExUtilSetBinaryMode
+4c82ff7 webpmux man page: Clarify some title, descriptions and examples
+23d4fb3 dsp/lossless: workaround gcc-4.9 bug on arm
+5af7719 dsp.h: collect gcc/clang version test macros
+90d1124 enc_neon: enable QuantizeBlock for aarch64
+ee78e78 SmartRGBYUV: fix odd-width problem with pixel replication
+c9ac204 fix some MSVC64 warning about float conversion
+f4497a1 cpu: check for _MSC_VER before using msvc inline asm
+e2159fd faster RGB->YUV conversion function (~7% speedup)
+21abaa0 Add smart RGB->YUV conversion option -pre 4
+1a161e2 configure: add work around for gcc-4.9 aarch64 bug
+55b10de MIPS: mips32r2: added optimization for BSwap32
+76d2192 Update PATENTS to reflect s/VP8/WebM/g
+29a9db1 MIPS: detect mips32r6 and disable mips32r1 code
+245c4a6 Correctly use the AC_CANONICAL_* macros
+40aa8b6 cosmetics
+2ddcca5 cosmetics: remove some extraneous 'extern's
+f40dd7c vp8enci.h: cosmetics: fix '*' placement
+4610c9c bit_writer: cosmetics: rename kFlush() -> Flush()
+fc3c175 dsp: detect mips64 & disable mips32 code
+c1a7955 cwebp.1: restore quality description
+57a7e73 correct alpha_dithering_strength ABI check
+6c83157 correct WebPMemoryWriterClear ABI check
+8af2771 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1)
+f59c0b4 iosbuild.sh: specify optimization flags
+8d34ea3 update ChangeLog (tag: v0.4.1-rc1)
+dbc3da6 makefile.unix: add vwebp.1 to the dist target
+89a7c83 update ChangeLog
+ffe67ee Merge "update NEWS for the next release" into 0.4.1
+2def1fe gif2webp: dust up the help message
+fb668d7 remove -noalphadither option from README/vwebp.1
+e49f693 update NEWS for the next release
+cd01358 Merge "update AUTHORS" into 0.4.1
+268d01e update AUTHORS
+85213b9 bump version to 0.4.1
+695f80a Merge "restore mux API compatibility" into 0.4.1
+862d296 restore mux API compatibility
+8f6f8c5 remove the !WEBP_REFERENCE_IMPLEMENTATION tweak in Put8x8uv
+d713a69 Merge changes If4debc15,I437a5d5f into 0.4.1
+c2fc52e restore encode API compatibility
+793368e restore decode API compatibility
+b8984f3 gif2webp: fix compile with giflib 5.1.0
+222f9b1 gif2webp: simplify giflib version checking
+d2cc61b Extend MakeARGB32() to accept Alpha channel.
+4595b62 Merge "use explicit size of kErrorMessages[] arrays"
+157de01 Merge "Actuate memory stats for PRINT_MEMORY_INFO"
+fbda2f4 JPEG decoder: delay conversion to YUV to WebPEncode() call
+0b747b1 use explicit size of kErrorMessages[] arrays
+3398d81 Actuate memory stats for PRINT_MEMORY_INFO
+6f3202b Merge "move WebPPictureInit to picture.c"
+6c347bb move WebPPictureInit to picture.c
+fb3acf1 fix configure message for multi-thread
+40b086f configure: check for _beginthreadex
+1549d62 reorder the YUVA->ARGB and ARGB->YUVA functions correctly
+c6461bf Merge "extract colorspace code from picture.c into picture_csp.c"
+736f2a1 extract colorspace code from picture.c into picture_csp.c
+645daa0 Merge "configure: check for -Wformat-security"
+abafed8 configure: check for -Wformat-security
+fbadb48 split monolithic picture.c into picture_{tools,psnr,rescale}.c
+c76f07e dec_neon/TransformAC3: initialize vector w/vcreate
+bb4fc05 gif2webp: Allow single-frame animations
+46fd44c thread: remove harmless race on status_ in End()
+5a1a726 Merge "configure: check for __builtin_bswapXX()"
+6781423 configure: check for __builtin_bswapXX()
+6450c48 configure: fix iOS builds
+6422e68 VP8LFillBitWindow: enable fast path for 32-bit builds
+4f7f52b VP8LFillBitWindow: respect WEBP_FORCE_ALIGNED
+e458bad endian_inl.h: implement htoleXX with BSwapXX
+f2664d1 endian_inl.h: add BSwap16
+6fbf534 Merge "configure: add --enable-aligned"
+dc0f479 configure: add --enable-aligned
+9cc69e2 Merge "configure: support WIC + OpenGL under mingw64"
+257adfb remove experimental YUV444 YUV422 and YUV400 code
+10f4257 configure: support WIC + OpenGL under mingw64
+380cca4 configure.ac: add AC_C_BIGENDIAN
+ee70a90 endian_inl.h: add BSwap64
+47779d4 endian_inl.h: add BSwap32
+d5104b1 utils: add endian_inl.h
+58ab622 Merge "make alpha-detection loop in IsKeyFrame() in good x/y order"
+9d56290 make alpha-detection loop in IsKeyFrame() in good x/y order
+516971b lossless: Remove unaligned read warning
+b8b596f Merge "configure.ac: add an autoconf version prerequisite"
+34b02f8 configure.ac: add an autoconf version prerequisite
+e59f536 neon: normalize vdup_n_* usage
+6ee7160 Merge changes I0da7b3d3,Idad2f278,I4accc305
+abc02f2 Merge "fix (uncompiled) typo"
+bc03670 neon: add INIT_VECTOR4
+6c1c632 neon: add INIT_VECTOR3
+dc7687e neon: add INIT_VECTOR2
+4536e7c add WebPMuxSetCanvasSize() to the mux API
+824eab1 fix (uncompiled) typo
+1f3e5f1 remove unused 'shift' argument and QFIX2 define
+8e86705 Merge "VP8LoadNewBytes: use __builtin_bswap32 if available"
+1b6a263 Merge "Fix handling of weird GIF with canvas dimension 0x0"
+1da3d46 VP8LoadNewBytes: use __builtin_bswap32 if available
+1582e40 Fix handling of weird GIF with canvas dimension 0x0
+b8811da Merge "rename interface -> winterface"
+db8b8b5 Fix logic in the GIF LOOP-detection parsing
+25aaddc rename interface -> winterface
+5584d9d make WebPSetWorkerInterface() check its arguments
+a9ef7ef Merge "cosmetics: update thread.h comments"
+c6af999 Merge "dust up the help message"
+0a8b886 dust up the help message
+a9cf319 cosmetics: update thread.h comments
+27bfeee QuantizeBlock SSE2 Optimization:
+2bc0dc3 Merge "webpmux: warn when odd frame offsets are used"
+3114ebe Merge changes Id8edd3c1,Id418eb96,Ide05e3be
+c072663 webpmux: warn when odd frame offsets are used
+c5c6b40 Merge "add alpha dithering for lossy"
+d514678 examples/Android.mk: add cwebp
+ca0fa7c Android.mk: move dwebp to examples/Android.mk
+73d8fca Android.mk: add ENABLE_SHARED flag
+6e93317 muxread: fix out of bounds read
+8b0f6a4 Makefile.vc: fix CFLAGS assignment w/HAVE_AVX2=1
+bbe32df add alpha dithering for lossy
+7902076 Merge "make error-code reporting consistent upon malloc failure"
+77bf441 make error-code reporting consistent upon malloc failure
+7a93c00 **/Makefile.am: remove unused AM_CPPFLAGS
+24e3080 Add an interface abstraction to the WebP worker thread implementation
+d6cd635 Merge "fix orig_rect==NULL case"
+2bfd1ff fix orig_rect==NULL case
+059e21c Merge "configure: move config.h to src/webp/config.h"
+f05fe00 properly report back encoding error code in WebPFrameCacheAddFrame()
+32b3137 configure: move config.h to src/webp/config.h
+90090d9 Merge changes I7c675e51,I84f7d785
+ae7661b makefiles: define WEBP_HAVE_AVX2 when appropriate
+69fce2e remove the special casing for res->first in VP8SetResidualCoeffs
+6e61a3a configure: test for -msse2
+b9d2efc rename upsampling_mips32.c to yuv_mips32.c
+bdfeeba dsp/yuv: move sse2 functions to yuv_sse2.c
+46b32e8 Merge "configure: set WEBP_HAVE_AVX2 when available"
+88305db Merge "VP8RandomBits2: prevent signed int overflow"
+73fee88 VP8RandomBits2: prevent signed int overflow
+db4860b enc_sse2: prevent signed int overflow
+3fdaf4d Merge "real fix for longjmp warning"
+385e334 real fix for longjmp warning
+230a055 configure: set WEBP_HAVE_AVX2 when available
+a2ac8a4 restore original value_/range_ field order
+5e2ee56 Merge "remove libwebpdspdecode dep on libwebpdsp_avx2"
+61362db remove libwebpdspdecode dep on libwebpdsp_avx2
+42c447a Merge "lossy bit-reader clean-up:"
+479ffd8 Merge "remove unused #include's"
+9754d39 Merge "strong filtering speed-up (~2-3% x86, ~1-2% for NEON)"
+158aff9 remove unused #include's
+09545ee lossy bit-reader clean-up:
+ea8b0a1 strong filtering speed-up (~2-3% x86, ~1-2% for NEON)
+6679f89 Optimize VP8SetResidualCoeffs.
+ac591cf fix for gcc-4.9 warnings about longjmp + local variables
+4dfa86b dsp/cpu: NaCl has no support for xgetbv
+4c39869 Merge "cwebp: fallback to native webp decode in WIC builds"
+33aa497 Merge "cwebp: add some missing newlines in longhelp output"
+c9b340a fix missing WebPInitAlphaProcessing call for premultiplied colorspace output
+57897ba Merge "lossless_neon: use vcreate_*() where appropriate"
+6aa4777 Merge "(enc|dec)_neon: use vcreate_*() where appropriate"
+0d346e4 Always reinit VP8TransformWHT instead of hard-coding
+7d039fc cwebp: fallback to native webp decode in WIC builds
+d471f42 cwebp: add some missing newlines in longhelp output
+bf0e003 lossless_neon: use vcreate_*() where appropriate
+9251c2f (enc|dec)_neon: use vcreate_*() where appropriate
+399b916 lossy decoding: correct alpha-rescaling for YUVA format
+78c12ed Merge "Makefile.vc: add rudimentary avx2 support"
+dc5b122 try to remove the spurious warning for static analysis
+ddfefd6 Makefile.vc: add rudimentary avx2 support
+a891164 Merge "simplify VP8LInitBitReader()"
+fdbcd44 simplify VP8LInitBitReader()
+7c00428 makefile.unix: add rudimentary avx2 support
+515e35c Merge "add stub dsp/enc_avx2.c"
+a05dc14 SSE2: yuv->rgb speed-up for point-sampling
+178e9a6 add stub dsp/enc_avx2.c
+1b99c09 Merge "configure: add a test for -mavx2"
+fe72807 configure: add a test for -mavx2
+e46a247 cpu: fix check for __cpuidex availability
+176fda2 fix the bit-writer for lossless in 32bit mode
+541784c dsp.h: add a check for AVX2 / define WEBP_USE_AVX2
+bdb151e dsp/cpu: add AVX2 detection
+ab9f2f8 Merge "revamp the point-sampling functions by processing a full plane"
+a2f8b28 revamp the point-sampling functions by processing a full plane
+ef07602 use decoder's DSP functions for autofilter
+2b5cb32 Merge "dsp/cpu: add AVX detection"
+df08e67 dsp/cpu: add AVX detection
+e2f405c Merge "clean-up and slight speed-up in-loop filtering SSE2"
+f60957b clean-up and slight speed-up in-loop filtering SSE2
+9fc3ae4 .gitattributes: treat .ppm as binary
+3da924b Merge "dsp/WEBP_USE_NEON: test for __aarch64__"
+c716449 Android.mk: always include *_neon.c in the build
+a577b23 dsp/WEBP_USE_NEON: test for __aarch64__
+54bfffc move RemapBitReader() from idec.c to bit_reader code
+34168ec Merge "remove all unused layer code"
+f1e7717 remove all unused layer code
+b0757db Code cleanup for VP8LGetHistoImageSymbols.
+5fe628d make the token page size be variable instead of fixed 8192
+f948d08 memory debug: allow setting pre-defined malloc failure points
+ca3d746 use block-based allocation for backward refs storage, and free-lists
+1ba61b0 enable NEON intrinsics in aarch64 builds
+b9d2bb6 dsp/neon.h: coalesce intrinsics-related defines
+b5c7525 iosbuild: add support for iOSv7/aarch64
+9383afd Reduce number of memory allocations while decoding lossless.
+888e63e Merge "dsp/lossless: prevent signed int overflow in left shift ops"
+8137f3e Merge "instrument memory allocation routines for debugging"
+2aa1873 instrument memory allocation routines for debugging
+d3bcf72 Don't allocate VP8LHashChain, but treat like automatic object
+bd6b861 dsp/lossless: prevent signed int overflow in left shift ops
+b7f19b8 Merge "dec/vp8l: prevent signed int overflow in left shift ops"
+29059d5 Merge "remove some uint64_t casts and use."
+e69a1df dec/vp8l: prevent signed int overflow in left shift ops
+cf5eb8a remove some uint64_t casts and use.
+38e2db3 MIPS: MIPS32r1: Added optimization for HistogramAdd.
+e0609ad dwebp: fix exit code on webp load failure
+bbd358a Merge "example_util.h: avoid forward declaring enums"
+8955da2 example_util.h: avoid forward declaring enums
+6d6865f Added SSE2 variants for Average2/3/4
+b3a616b make HistogramAdd() a pointer in dsp
+c8bbb63 dec_neon: relocate some inline-asm defines
+4e393bb dec_neon: enable intrinsics-only functions
+ba99a92 dec_neon: use positive tests for USE_INTRINSICS
+69058ff Merge "example_util: add ExUtilDecodeWebPIncremental"
+a7828e8 dec_neon: make WORK_AROUND_GCC conditional on version
+3f3d717 Merge "enc_neon: enable intrinsics-only functions"
+de3cb6c Merge "move LOCAL_GCC_VERSION def to dsp.h"
+1b2fe14 example_util: add ExUtilDecodeWebPIncremental
+ca49e7a Merge "enc_neon: move Transpose4x4 to dsp/neon.h"
+ad900ab Merge "fix warning about size_t -> int conversion"
+4825b43 fix warning about size_t -> int conversion
+42b35e0 enc_neon: enable intrinsics-only functions
+f937e01 move LOCAL_GCC_VERSION def to dsp.h
+5e1a17e enc_neon: move Transpose4x4 to dsp/neon.h
+c7b92a5 dec_neon: (WORK_AROUND_GCC) delete unused Load4x8
+8e5f90b Merge "make ExUtilLoadWebP() accept NULL bitstream param."
+05d4c1b Merge "cwebp: add webpdec"
+ddeb6ac cwebp: add webpdec
+35d7d09 Merge "Reduce memory footprint for encoding WebP lossless."
+0b89610 Reduce memory footprint for encoding WebP lossless.
+f0b65c9 make ExUtilLoadWebP() accept NULL bitstream param.
+9c0a60c Merge "dwebp: move webp decoding to example_util"
+1d62acf MIPS: MIPS32r1: Added optimization for HuffmanCost functions.
+4a0e739 dwebp: move webp decoding to example_util
+c022046 Merge "Bugfix: Incremental decode of lossy-alpha"
+8c7cd72 Bugfix: Incremental decode of lossy-alpha
+7955152 MIPS: fix error with number of registers.
+b1dabe3 Merge "Move the HuffmanCost() function to dsp lib"
+75b1200 Move the HuffmanCost() function to dsp lib
+2772b8b MIPS: fix assembler error revealed by clang's debug build
+6653b60 enc_mips32: fix unused symbol warning in debug
+8dec120 enc_mips32: disable ITransform(One) in debug builds
+98519dd enc_neon: convert Disto4x4 to intrinsics
+fe9317c cosmetics:
+953b074 enc_neon: cosmetics
+a9fc697 Merge "WIP: extract the float-calculation of HuffmanCost from loop"
+3f84b52 Merge "replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)"
+4ae0533 MIPS: MIPS32r1: Added optimizations for ExtraCost functions.
+b30a04c WIP: extract the float-calculation of HuffmanCost from loop
+a8fe8ce Merge "NEON intrinsics version of CollectHistogram"
+95203d2 NEON intrinsics version of CollectHistogram
+7ca2e74 replace some mult-long (vmull_u8) with mult-long-accumulate (vmlal_u8)
+41c6efb fix lossless_neon.c
+8ff96a0 NEON intrinsics version of FTransform
+0214f4a Merge "MIPS: MIPS32r1: Added optimizations for FastLog2"
+baabf1e MIPS: MIPS32r1: Added optimizations for FastLog2
+3d49871 NEON functions for lossless coding
+3fe0291 MIPS: MIPS32r1: Added optimizations for SSE functions.
+c503b48 Merge "fix the gcc-4.6.0 bug by implementing alternative method"
+abe6f48 fix the gcc-4.6.0 bug by implementing alternative method
+5598bde enc_mips32.c: fix file mode
+2b1b4d5 MIPS: MIPS32r1: Add optimization for GetResidualCost
+f0a1f3c Merge "MIPS: MIPS32r1: Added optimization for FTransform"
+7231f61 MIPS: MIPS32r1: Added optimization for FTransform
+869eaf6 ~30% encoding speedup: use NEON for QuantizeBlock()
+f758af6 enc_neon: convert FTransformWHT to intrinsics
+7dad095 MIPS: MIPS32r1: Added optimization for Disto4x4 (TTransform)
+2298d5f MIPS: MIPS32r1: Added optimization for QuantizeBlock
+e88150c Merge "MIPS: MIPS32r1: Add optimization for ITransform"
+de693f2 lossless_neon: disable VP8LConvert* functions
+4143332 NEON intrinsics for encoding
+0ca2914 MIPS: MIPS32r1: Add optimization for ITransform
+71bca5e dec_neon: use vst_lane instead of vget_lane
+bf06105 Intrinsics NEON version of TransformOne
+19c6f1b Merge "dec_neon: use vld?_lane instead of vset?_lane"
+7a94c0c upsampling_neon: drop NEON suffix from local functions
+d14669c upsampling_sse2: drop SSE2 suffix from local functions
+2ca42a4 enc_sse2: drop SSE2 suffix from local functions
+d038e61 dec_sse2: drop SSE2 suffix from local functions
+fa52d75 dec_neon: use vld?_lane instead of vset?_lane
+c520e77 cosmetic: fix long line
+4b0f2da Merge "add intrinsics NEON code for chroma strong-filtering"
+e351ec0 add intrinsics NEON code for chroma strong-filtering
+aaf734b Merge "Add SSE2 version of forward cross-color transform"
+c90a902 Add SSE2 version of forward cross-color transform
+bc374ff Use histogram_bits to initalize transform_bits.
+2132992 Merge "Add strong filtering intrinsics (inner and outer edges)"
+5fbff3a Add strong filtering intrinsics (inner and outer edges)
+d4813f0 Add SSE2 function for Inverse Cross-color Transform
+2602956 dec_neon: add strong loopfilter intrinsics
+cca7d7e Merge "add intrinsics version of SimpleHFilter16NEON()"
+1a05dfa windows: fix dll builds
+d6c50d8 Merge "add some colorspace conversion functions in NEON"
+4fd7c82 SSE2 variants of Subtract-Green: Rectify loop condition
+97e5fac add some colorspace conversion functions in NEON
+b9a7a45 add intrinsics version of SimpleHFilter16NEON()
+daccbf4 add light filtering NEON intrinsics
+af44460 fix typo in STORE_WHT
+6af6b8e Tune HistogramCombineBin for large images.
+af93bdd use WebPSafe[CM]alloc/WebPSafeFree instead of [cm]alloc/free
+51f406a lossless_sse2: relocate VP8LDspInitSSE2 proto
+0f4f721 separate SSE2 lossless functions into its own file
+514fc25 VP8LConvertFromBGRA: use conversion function pointers
+6d2f352 dsp/dec: TransformDCUV: use VP8TransformDC
+defc8e1 Merge "fix out-of-bound read during alpha-plane decoding"
+fbed364 Merge "dsp: reuse wht transform from dec in encoder"
+d846708 Merge "Add SSE2 version of ARGB -> BGR/RGB/... conversion functions"
+207d03b fix out-of-bound read during alpha-plane decoding
+d1b33ad 2-5% faster trellis with clang/MacOS (and ~2-3% on ARM)
+369c26d Add SSE2 version of ARGB -> BGR/RGB/... conversion functions
+df230f2 dsp: reuse wht transform from dec in encoder
+80e218d Android.mk: fix build with APP_ABI=armeabi-v7a-hard
+59daf08 Merge "cosmetics:"
+5362200 cosmetics:
+3e7f34a AssignSegments: quiet array-bounds warning
+3c2ebf5 Merge "UpdateHistogramCost: avoid implicit double->float"
+cf821c8 UpdateHistogramCost: avoid implicit double->float
+312e638 Extend the search space for GetBestGreenRedToBlue
+1c58526 Fix few nits
+fef2270 Optimize and re-structure VP8LGetHistoImageSymbols
+068b14a Optimize lossless decoding.
+5f0cfa8 Do a binary search to get the optimum cache bits.
+24ca367 Merge "allow 'cwebp -o -' to emit output to stdout"
+e12f874 allow 'cwebp -o -' to emit output to stdout
+2bcad89 allow some more stdin/stout I/O
+84ed4b3 fix cwebp.1 typos after patch #69199
+65b99f1 add a -z option to cwebp, and WebPConfigLosslessPreset() function
+3017661 4-5% faster trellis by removing some unneeded calculations.
+687a58e histogram.c: reindent after b33e8a0
+06d456f Merge "~3-4% faster lossless encoding"
+c60de26 ~3-4% faster lossless encoding
+42eb06f Merge "few cosmetics after patch #69079"
+82af826 few cosmetics after patch #69079
+b33e8a0 Refactor code for HistogramCombine.
+ca1bfff Merge "5-10% encoding speedup with faster trellis (-m 6)"
+5aeeb08 5-10% encoding speedup with faster trellis (-m 6)
+82ae1bf cosmetics: normalize VP8GetCPUInfo checks
+e3dd924 Merge "Refactor GetBestPredictorForTile for future tuning."
+206cc1b Refactor GetBestPredictorForTile for future tuning.
+3cb8406 Merge "speed-up trellis quant (~5-10% overall speed-up)"
+b66f222 Merge "lossy encoding: ~3% speed-up"
+4287d0d speed-up trellis quant (~5-10% overall speed-up)
+390c8b3 lossy encoding: ~3% speed-up
+9a463c4 Merge "dec_neon: convert TransformWHT to intrinsics"
+e8605e9 Merge "dec_neon: add ConvertU8ToS16"
+4aa3e41 MIPS: MIPS32r1: rescaler bugfix
+c16cd99 Speed up lossless encoder.
+9d6b5ff dec_neon: convert TransformWHT to intrinsics
+2ff0aae dec_neon: add ConvertU8ToS16
+77a8f91 fix compilation with USE_YUVj flag
+4acbec1 Merge changes I3b240ffb,Ia9370283,Ia2d28728
+2719bb7 dec_neon: TransformAC3: work on packed vectors
+b7b60ca dec_neon: add SaturateAndStore4x4
+b7685d7 Rescale: let ImportRow / ExportRow be pointer-to-function
+e02f16e dec_neon.c: convert TransformDC to intrinsics
+9cba963 add missing file
+8992ddb use static clipping tables
+0235d5e 1-2% faster quantization in SSE2
+b2fbc36 fix VC12-x64 warning
+6e37cb9 Merge "cosmetics: backward_references.c: reindent after a7d2ee3"
+a42ea97 cosmetics: backward_references.c: reindent after a7d2ee3
+6c32744 Merge "fix missing __BIG_ENDIAN__ definition on some platform"
+a8b6aad fix missing __BIG_ENDIAN__ definition on some platform
+fde2904 Increase initial buffer size for VP8L Bit Writer.
+a7d2ee3 Optimize cache estimate logic.
+7fb6095 Merge "dec_neon.c: add TransformAC3"
+bf182e8 VP8LBitWriter: use a bit-accumulator
+3f40b4a Merge "MIPS: MIPS32r1: clang macro warning resolved"
+1684f4e WebP Decoder: Mark some truncated bitstreams as invalid
+acbedac MIPS: MIPS32r1: clang macro warning resolved
+228e487 dec_neon.c: add TransformAC3
+393f89b Android.mk: avoid gcc-specific flags with clang
+32aeaf1 revamp VP8LColorSpaceTransform() a bit
+0c7cc4c Merge "Don't dereference NULL, ensure HashChain fully initialized"
+391316f Don't dereference NULL, ensure HashChain fully initialized
+926ff40 WEBP_SWAP_16BIT_CSP: remove code dup
+1d1cd3b Fix decode bug for rgbA_4444/RGBA_4444 color-modes.
+939e70e update AUTHORS file
+8934a62 cosmetics: *_mips32.c
+dd438c9 MIPS: MIPS32r1: Optimization of some simple point-sampling functions. PATCH [6/6]
+5352091 Added support for calling sampling functions via pointers.
+d16c697 MIPS: MIPS32r1: Optimization of filter functions. PATCH [5/6]
+04336fc MIPS: MIPS32r1: Optimization of function TransformOne. PATCH [4/6]
+92d8fc7 MIPS: MIPS32r1: Optimization of function WebPRescalerImportRow. PATCH [3/6]
+bbc23ff parse one row of intra modes altogether
+a2f608f Merge "MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]"
+8823085 MIPS: MIPS32r1: Optimization of function WebPRescalerExportRow. [2/6]
+c5a5b02 decode mt+incremental: fix segfault in debug builds
+9882b2f always use fast-analysis for all methods.
+000adac Merge "autoconf: update ax_pthread.m4"
+2d2fc37 update .gitignore
+5bf4255 Merge "Make it possible to avoid automagic dependencies"
+c1cb193 disable NEON for arm64 platform
+73a304e Make it possible to avoid automagic dependencies
+4d493f8 MIPS: MIPS32r1: Decoder bit reader function optimized. PATCH [1/6]
+c741183 make WebPCleanupTransparentArea work with argb picture
+5da1855 add a decoding option to flip image vertically
+00c3c4e Merge "add man/vwebp.1"
+2c6bb42 add man/vwebp.1
+ea59a8e Merge "Merge tag 'v0.4.0'"
+7574bed fix comments related to array sizes
+0b5a90f dwebp.1: fix option formatting
+effcb0f Merge tag 'v0.4.0'
+7c76255 autoconf: update ax_pthread.m4
+fff2a11 make -short work with -print_ssim, -print_psnr, etc.
+68e7901 update ChangeLog (tag: v0.4.0-rc1, tag: v0.4.0, origin/0.4.0, 0.4.0)
256e433 update NEWS description with new general features
2962534 Merge "gif2webp: don't use C99 %zu" into 0.4.0
3b9f9dd gif2webp: don't use C99 %zu
diff --git a/src/3rdparty/libwebp/NEWS b/src/3rdparty/libwebp/NEWS
index 55c2c5e..61d0d6c 100644
--- a/src/3rdparty/libwebp/NEWS
+++ b/src/3rdparty/libwebp/NEWS
@@ -1,3 +1,29 @@
+- 3/3/15: version 0.4.3
+ This is a binary compatible release.
+ * Android / gcc / iOS / MSVS build fixes and improvements
+ * lossless decode fix (issue #239 -- since 0.4.0)
+ * documentation / vwebp updates for animation
+ * multi-threading fix (issue #234)
+
+- 10/13/14: version 0.4.2
+ This is a binary compatible release.
+ * Android / gcc build fixes
+ * (Windows) fix reading from stdin and writing to stdout
+ * gif2webp: miscellaneous fixes
+ * fix 'alpha-leak' with lossy compression (issue #220)
+ * the lossless bitstream spec has been amended to reflect the current code
+
+- 7/24/14: version 0.4.1
+ This is a binary compatible release.
+ * AArch64 (arm64) & MIPS support/optimizations
+ * NEON assembly additions:
+ - ~25% faster lossy decode / encode (-m 4)
+ - ~10% faster lossless decode
+ - ~5-10% faster lossless encode (-m 3/4)
+ * dwebp/vwebp can read from stdin
+ * cwebp/gif2webp can write to stdout
+ * cwebp can read webp files; useful if storing sources as webp lossless
+
- 12/19/13: version 0.4.0
* improved gif2webp tool
* numerous fixes, compression improvement and speed-up
diff --git a/src/3rdparty/libwebp/PATENTS b/src/3rdparty/libwebp/PATENTS
index 4414d83..79d17d7 100644
--- a/src/3rdparty/libwebp/PATENTS
+++ b/src/3rdparty/libwebp/PATENTS
@@ -1,22 +1,23 @@
Additional IP Rights Grant (Patents)
+------------------------------------
-"This implementation" means the copyrightable works distributed by
-Google as part of the WebM Project.
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
-Google hereby grants to you a perpetual, worldwide, non-exclusive,
-no-charge, royalty-free, irrevocable (except as stated in this section)
-patent license to make, have made, use, offer to sell, sell, import,
-transfer, and otherwise run, modify and propagate the contents of this
-implementation of VP8, where such license applies only to those patent
-claims, both currently owned by Google and acquired in the future,
-licensable by Google that are necessarily infringed by this
-implementation of VP8. This grant does not include claims that would be
-infringed only as a consequence of further modification of this
-implementation. If you or your agent or exclusive licensee institute or
-order or agree to the institution of patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
-that this implementation of VP8 or any code incorporated within this
-implementation of VP8 constitutes direct or contributory patent
-infringement, or inducement of patent infringement, then any patent
-rights granted to you under this License for this implementation of VP8
-shall terminate as of the date such litigation is filed.
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitutes direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/src/3rdparty/libwebp/README b/src/3rdparty/libwebp/README
index 64e9f2d..c3f32c5 100644
--- a/src/3rdparty/libwebp/README
+++ b/src/3rdparty/libwebp/README
@@ -4,7 +4,7 @@
\__\__/\____/\_____/__/ ____ ___
/ _/ / \ \ / _ \/ _/
/ \_/ / / \ \ __/ \__
- \____/____/\_____/_____/____/v0.4.0
+ \____/____/\_____/_____/____/v0.4.3
Description:
============
@@ -140,28 +140,30 @@ A longer list of options is available using the -longhelp command line flag:
Usage:
cwebp [-preset <...>] [options] in_file [-o out_file]
-If input size (-s) for an image is not specified, it is assumed to be a PNG,
-JPEG or TIFF file.
-options:
+If input size (-s) for an image is not specified, it is
+assumed to be a PNG, JPEG, TIFF or WebP file.
+
+Options:
-h / -help ............ short help
-H / -longhelp ........ long help
-q <float> ............. quality factor (0:small..100:big)
- -alpha_q <int> ......... Transparency-compression quality (0..100).
- -preset <string> ....... Preset setting, one of:
+ -alpha_q <int> ......... transparency-compression quality (0..100)
+ -preset <string> ....... preset setting, one of:
default, photo, picture,
drawing, icon, text
- -preset must come first, as it overwrites other parameters.
+ -preset must come first, as it overwrites other parameters
+
-m <int> ............... compression method (0=fast, 6=slowest)
-segments <int> ........ number of segments to use (1..4)
- -size <int> ............ Target size (in bytes)
- -psnr <float> .......... Target PSNR (in dB. typically: 42)
+ -size <int> ............ target size (in bytes)
+ -psnr <float> .......... target PSNR (in dB. typically: 42)
- -s <int> <int> ......... Input size (width x height) for YUV
- -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)
+ -s <int> <int> ......... input size (width x height) for YUV
+ -sns <int> ............. spatial noise shaping (0:off, 100:max)
-f <int> ............... filter strength (0=off..100)
-sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
- -strong ................ use strong filter instead of simple (default).
- -nostrong .............. use simple filter instead of strong.
+ -strong ................ use strong filter instead of simple (default)
+ -nostrong .............. use simple filter instead of strong
-partition_limit <int> . limit quality to fit the 512k limit on
the first partition (0=no degradation ... 100=full)
-pass <int> ............ analysis pass number (1..10)
@@ -169,41 +171,40 @@ options:
-resize <w> <h> ........ resize picture (after any cropping)
-mt .................... use multi-threading if available
-low_memory ............ reduce memory usage (slower encoding)
- -map <int> ............. print map of extra info.
- -print_psnr ............ prints averaged PSNR distortion.
- -print_ssim ............ prints averaged SSIM distortion.
- -print_lsim ............ prints local-similarity distortion.
- -d <file.pgm> .......... dump the compressed output (PGM file).
- -alpha_method <int> .... Transparency-compression method (0..1)
- -alpha_filter <string> . predictive filtering for alpha plane.
- One of: none, fast (default) or best.
- -alpha_cleanup ......... Clean RGB values in transparent area.
- -blend_alpha <hex> ..... Blend colors against background color
+ -map <int> ............. print map of extra info
+ -print_psnr ............ prints averaged PSNR distortion
+ -print_ssim ............ prints averaged SSIM distortion
+ -print_lsim ............ prints local-similarity distortion
+ -d <file.pgm> .......... dump the compressed output (PGM file)
+ -alpha_method <int> .... transparency-compression method (0..1)
+ -alpha_filter <string> . predictive filtering for alpha plane,
+ one of: none, fast (default) or best
+ -alpha_cleanup ......... clean RGB values in transparent area
+ -blend_alpha <hex> ..... blend colors against background color
expressed as RGB values written in
hexadecimal, e.g. 0xc0e0d0 for red=0xc0
- green=0xe0 and blue=0xd0.
- -noalpha ............... discard any transparency information.
- -lossless .............. Encode image losslessly.
- -hint <string> ......... Specify image characteristics hint.
- One of: photo, picture or graph
+ green=0xe0 and blue=0xd0
+ -noalpha ............... discard any transparency information
+ -lossless .............. encode image losslessly
+ -hint <string> ......... specify image characteristics hint,
+ one of: photo, picture or graph
-metadata <string> ..... comma separated list of metadata to
copy from the input to the output if present.
Valid values: all, none (default), exif, icc, xmp
-short ................. condense printed message
- -quiet ................. don't print anything.
- -version ............... print version number and exit.
- -noasm ................. disable all assembly optimizations.
+ -quiet ................. don't print anything
+ -version ............... print version number and exit
+ -noasm ................. disable all assembly optimizations
-v ..................... verbose, e.g. print encoding/decoding times
-progress .............. report encoding progress
Experimental Options:
- -jpeg_like ............. Roughly match expected JPEG size.
- -af .................... auto-adjust filter strength.
+ -jpeg_like ............. roughly match expected JPEG size
+ -af .................... auto-adjust filter strength
-pre <int> ............. pre-processing filter
-
The main options you might want to try in order to further tune the
visual quality are:
-preset
@@ -262,19 +263,19 @@ Use following options to convert into alternate image formats:
-yuv ......... save the raw YUV samples in flat layout
Other options are:
- -version .... print version number and exit.
- -nofancy ..... don't use the fancy YUV420 upscaler.
- -nofilter .... disable in-loop filtering.
- -nodither .... disable dithering.
+ -version .... print version number and exit
+ -nofancy ..... don't use the fancy YUV420 upscaler
+ -nofilter .... disable in-loop filtering
+ -nodither .... disable dithering
-dither <d> .. dithering strength (in 0..100)
-mt .......... use multi-threading
-crop <x> <y> <w> <h> ... crop output with the given rectangle
-scale <w> <h> .......... scale the output (*after* any cropping)
- -alpha ....... only save the alpha plane.
+ -alpha ....... only save the alpha plane
-incremental . use incremental decoding (useful for tests)
- -h ....... this help message.
+ -h ....... this help message
-v ....... verbose (e.g. print encoding/decoding times)
- -noasm ....... disable all assembly optimizations.
+ -noasm ....... disable all assembly optimizations
Visualization tool:
===================
@@ -288,19 +289,19 @@ Usage: vwebp in_file [options]
Decodes the WebP image file and visualize it using OpenGL
Options are:
- -version .... print version number and exit.
- -noicc ....... don't use the icc profile if present.
- -nofancy ..... don't use the fancy YUV420 upscaler.
- -nofilter .... disable in-loop filtering.
- -dither <int> dithering strength (0..100). Default=50.
- -mt .......... use multi-threading.
- -info ........ print info.
- -h ....... this help message.
+ -version .... print version number and exit
+ -noicc ....... don't use the icc profile if present
+ -nofancy ..... don't use the fancy YUV420 upscaler
+ -nofilter .... disable in-loop filtering
+ -dither <int> dithering strength (0..100), default=50
+ -mt .......... use multi-threading
+ -info ........ print info
+ -h ....... this help message
Keyboard shortcuts:
- 'c' ................ toggle use of color profile.
- 'i' ................ overlay file information.
- 'q' / 'Q' / ESC .... quit.
+ 'c' ................ toggle use of color profile
+ 'i' ................ overlay file information
+ 'q' / 'Q' / ESC .... quit
Building:
---------
@@ -336,24 +337,24 @@ vwebp.
Usage:
gif2webp [options] gif_file -o webp_file
-options:
+Options:
-h / -help ............ this help
- -lossy ................. Encode image using lossy compression.
- -mixed ................. For each frame in the image, pick lossy
- or lossless compression heuristically.
+ -lossy ................. encode image using lossy compression
+ -mixed ................. for each frame in the image, pick lossy
+ or lossless compression heuristically
-q <float> ............. quality factor (0:small..100:big)
-m <int> ............... compression method (0=fast, 6=slowest)
- -kmin <int> ............ Min distance between key frames
- -kmax <int> ............ Max distance between key frames
+ -kmin <int> ............ min distance between key frames
+ -kmax <int> ............ max distance between key frames
-f <int> ............... filter strength (0=off..100)
-metadata <string> ..... comma separated list of metadata to
- copy from the input to the output if present.
+ copy from the input to the output if present
Valid values: all, none, icc, xmp (default)
-mt .................... use multi-threading if available
- -version ............... print version number and exit.
- -v ..................... verbose.
- -quiet ................. don't print anything.
+ -version ............... print version number and exit
+ -v ..................... verbose
+ -quiet ................. don't print anything
Building:
---------
@@ -442,15 +443,20 @@ The encoding flow looks like:
// Set up a byte-output write method. WebPMemoryWriter, for instance.
WebPMemoryWriter wrt;
+ WebPMemoryWriterInit(&wrt); // initialize 'wrt'
+
pic.writer = MyFileWriter;
pic.custom_ptr = my_opaque_structure_to_make_MyFileWriter_work;
- // initialize 'wrt' here...
// Compress!
int ok = WebPEncode(&config, &pic); // ok = 0 => error occurred!
WebPPictureFree(&pic); // must be called independently of the 'ok' result.
// output data should have been handled by the writer at that point.
+ // -> compressed data is the memory buffer described by wrt.mem / wrt.size
+
+ // deallocate the memory used by compressed data
+ WebPMemoryWriterClear(&wrt);
-------------------------------------- END PSEUDO EXAMPLE
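For reference, here is a compilable version of the flow sketched in the pseudo example above, using the stock WebPMemoryWrite callback instead of a custom MyFileWriter. This is a minimal sketch against the public encode API; the helper name and the fixed 75.f quality are illustrative, and error handling is deliberately terse.

#include "webp/encode.h"

// Sketch of the flow above: encode an RGBA buffer into an in-memory
// WebP bitstream. Assumes 'rgba' holds width * height * 4 bytes.
static int EncodeToMemory(const uint8_t* rgba, int width, int height) {
  WebPConfig config;
  WebPPicture pic;
  WebPMemoryWriter wrt;
  int ok;

  if (!WebPConfigPreset(&config, WEBP_PRESET_DEFAULT, 75.f)) return 0;
  if (!WebPPictureInit(&pic)) return 0;
  pic.width = width;
  pic.height = height;
  if (!WebPPictureImportRGBA(&pic, rgba, width * 4)) return 0;

  WebPMemoryWriterInit(&wrt);        // initialize 'wrt' before encoding
  pic.writer = WebPMemoryWrite;      // stock in-memory writer callback
  pic.custom_ptr = &wrt;

  ok = WebPEncode(&config, &pic);    // ok = 0 => error occurred!
  WebPPictureFree(&pic);             // called independently of 'ok'
  if (ok) {
    // compressed data is the memory buffer described by wrt.mem / wrt.size
  }
  WebPMemoryWriterClear(&wrt);       // deallocate the compressed data
  return ok;
}

WebPMemoryWriterClear() must be called to release wrt.mem whether or not the compressed data was consumed, which is exactly the step the amended example above adds.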
diff --git a/src/3rdparty/libwebp/src/dec/alpha.c b/src/3rdparty/libwebp/src/dec/alpha.c
index 93729a0..f23ba7d 100644
--- a/src/3rdparty/libwebp/src/dec/alpha.c
+++ b/src/3rdparty/libwebp/src/dec/alpha.c
@@ -16,13 +16,14 @@
#include "./vp8i.h"
#include "./vp8li.h"
#include "../utils/quant_levels_dec.h"
+#include "../utils/utils.h"
#include "../webp/format_constants.h"
//------------------------------------------------------------------------------
// ALPHDecoder object.
ALPHDecoder* ALPHNew(void) {
- ALPHDecoder* const dec = (ALPHDecoder*)calloc(1, sizeof(*dec));
+ ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
return dec;
}
@@ -30,7 +31,7 @@ void ALPHDelete(ALPHDecoder* const dec) {
if (dec != NULL) {
VP8LDelete(dec->vp8l_dec_);
dec->vp8l_dec_ = NULL;
- free(dec);
+ WebPSafeFree(dec);
}
}
@@ -107,12 +108,6 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
unfilter_func(width, height, width, row, num_rows, output);
}
- if (alph_dec->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) {
- if (!DequantizeLevels(output, width, height, row, num_rows)) {
- return 0;
- }
- }
-
if (row + num_rows == dec->pic_hdr_.height_) {
dec->is_alpha_decoded_ = 1;
}
@@ -142,12 +137,22 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
dec->alph_dec_ = NULL;
return NULL;
}
+ // if we allowed use of alpha dithering, check whether it's needed at all
+ if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
+ dec->alpha_dithering_ = 0; // disable dithering
+ } else {
+ num_rows = height; // decode everything in one pass
+ }
}
if (!dec->is_alpha_decoded_) {
int ok = 0;
assert(dec->alph_dec_ != NULL);
ok = ALPHDecode(dec, row, num_rows);
+ if (ok && dec->alpha_dithering_ > 0) {
+ ok = WebPDequantizeLevels(dec->alpha_plane_, width, height,
+ dec->alpha_dithering_);
+ }
if (!ok || dec->is_alpha_decoded_) {
ALPHDelete(dec->alph_dec_);
dec->alph_dec_ = NULL;
@@ -158,4 +163,3 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
// Return a pointer to the current decoded row.
return dec->alpha_plane_ + row * width;
}
-
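The alpha.c hunks above drop the old unconditional DequantizeLevels() pass and instead run WebPDequantizeLevels() only when alpha dithering was requested; the strength comes from the new options->alpha_dithering_strength decoder option, clamped to 0..100 in the frame.c diff further down. A minimal sketch of opting in from the public API (the helper name is illustrative):

#include "webp/decode.h"

// Sketch: request alpha dithering when decoding a lossy-alpha image.
static int DecodeWithAlphaDithering(const uint8_t* data, size_t data_size) {
  WebPDecoderConfig config;
  if (!WebPInitDecoderConfig(&config)) return 0;
  config.output.colorspace = MODE_RGBA;
  config.options.alpha_dithering_strength = 100;  // 0 (off) .. 100 (max)
  if (WebPDecode(data, data_size, &config) != VP8_STATUS_OK) return 0;
  // ... use config.output.u.RGBA here ...
  WebPFreeDecBuffer(&config.output);
  return 1;
}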
diff --git a/src/3rdparty/libwebp/src/dec/buffer.c b/src/3rdparty/libwebp/src/dec/buffer.c
index 1e852ef..42feac7 100644
--- a/src/3rdparty/libwebp/src/dec/buffer.c
+++ b/src/3rdparty/libwebp/src/dec/buffer.c
@@ -42,29 +42,34 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
ok = 0;
} else if (!WebPIsRGBMode(mode)) { // YUV checks
const WebPYUVABuffer* const buf = &buffer->u.YUVA;
- const uint64_t y_size = (uint64_t)buf->y_stride * height;
- const uint64_t u_size = (uint64_t)buf->u_stride * ((height + 1) / 2);
- const uint64_t v_size = (uint64_t)buf->v_stride * ((height + 1) / 2);
- const uint64_t a_size = (uint64_t)buf->a_stride * height;
+ const int y_stride = abs(buf->y_stride);
+ const int u_stride = abs(buf->u_stride);
+ const int v_stride = abs(buf->v_stride);
+ const int a_stride = abs(buf->a_stride);
+ const uint64_t y_size = (uint64_t)y_stride * height;
+ const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
+ const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
+ const uint64_t a_size = (uint64_t)a_stride * height;
ok &= (y_size <= buf->y_size);
ok &= (u_size <= buf->u_size);
ok &= (v_size <= buf->v_size);
- ok &= (buf->y_stride >= width);
- ok &= (buf->u_stride >= (width + 1) / 2);
- ok &= (buf->v_stride >= (width + 1) / 2);
+ ok &= (y_stride >= width);
+ ok &= (u_stride >= (width + 1) / 2);
+ ok &= (v_stride >= (width + 1) / 2);
ok &= (buf->y != NULL);
ok &= (buf->u != NULL);
ok &= (buf->v != NULL);
if (mode == MODE_YUVA) {
- ok &= (buf->a_stride >= width);
+ ok &= (a_stride >= width);
ok &= (a_size <= buf->a_size);
ok &= (buf->a != NULL);
}
} else { // RGB checks
const WebPRGBABuffer* const buf = &buffer->u.RGBA;
- const uint64_t size = (uint64_t)buf->stride * height;
+ const int stride = abs(buf->stride);
+ const uint64_t size = (uint64_t)stride * height;
ok &= (size <= buf->size);
- ok &= (buf->stride >= width * kModeBpp[mode]);
+ ok &= (stride >= width * kModeBpp[mode]);
ok &= (buf->rgba != NULL);
}
return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
@@ -131,9 +136,35 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
return CheckDecBuffer(buffer);
}
+VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
+ if (buffer == NULL) {
+ return VP8_STATUS_INVALID_PARAM;
+ }
+ if (WebPIsRGBMode(buffer->colorspace)) {
+ WebPRGBABuffer* const buf = &buffer->u.RGBA;
+ buf->rgba += (buffer->height - 1) * buf->stride;
+ buf->stride = -buf->stride;
+ } else {
+ WebPYUVABuffer* const buf = &buffer->u.YUVA;
+ const int H = buffer->height;
+ buf->y += (H - 1) * buf->y_stride;
+ buf->y_stride = -buf->y_stride;
+ buf->u += ((H - 1) >> 1) * buf->u_stride;
+ buf->u_stride = -buf->u_stride;
+ buf->v += ((H - 1) >> 1) * buf->v_stride;
+ buf->v_stride = -buf->v_stride;
+ if (buf->a != NULL) {
+ buf->a += (H - 1) * buf->a_stride;
+ buf->a_stride = -buf->a_stride;
+ }
+ }
+ return VP8_STATUS_OK;
+}
+
VP8StatusCode WebPAllocateDecBuffer(int w, int h,
const WebPDecoderOptions* const options,
WebPDecBuffer* const out) {
+ VP8StatusCode status;
if (out == NULL || w <= 0 || h <= 0) {
return VP8_STATUS_INVALID_PARAM;
}
@@ -160,8 +191,17 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
out->width = w;
out->height = h;
- // Then, allocate buffer for real
- return AllocateBuffer(out);
+ // Then, allocate buffer for real.
+ status = AllocateBuffer(out);
+ if (status != VP8_STATUS_OK) return status;
+
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+ // Use the stride trick if vertical flip is needed.
+ if (options != NULL && options->flip) {
+ status = WebPFlipBuffer(out);
+ }
+#endif
+ return status;
}
//------------------------------------------------------------------------------
@@ -178,8 +218,9 @@ int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
if (buffer != NULL) {
- if (!buffer->is_external_memory)
- free(buffer->private_memory);
+ if (!buffer->is_external_memory) {
+ WebPSafeFree(buffer->private_memory);
+ }
buffer->private_memory = NULL;
}
}
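WebPFlipBuffer() above implements vertical flip without copying a single pixel: each plane pointer is moved to its last row and the stride is negated, so the decoder's usual top-to-bottom writes land bottom-up. The abs() changes in CheckDecBuffer() exist precisely so these negative strides still validate. A caller reaches the trick through the new options->flip flag; a minimal sketch (helper name illustrative):

#include "webp/decode.h"

// Sketch: request a vertically flipped decode via the stride trick above.
static int DecodeFlipped(const uint8_t* data, size_t data_size) {
  WebPDecoderConfig config;
  if (!WebPInitDecoderConfig(&config)) return 0;
  config.output.colorspace = MODE_RGBA;
  config.options.flip = 1;  // ABI-gated: WEBP_DECODER_ABI_VERSION > 0x0203
  if (WebPDecode(data, data_size, &config) != VP8_STATUS_OK) return 0;
  // config.output.u.RGBA.rgba now points at the last row; .stride is negative.
  WebPFreeDecBuffer(&config.output);
  return 1;
}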
diff --git a/src/3rdparty/libwebp/src/dec/frame.c b/src/3rdparty/libwebp/src/dec/frame.c
index e1eea94..2359acc 100644
--- a/src/3rdparty/libwebp/src/dec/frame.c
+++ b/src/3rdparty/libwebp/src/dec/frame.c
@@ -177,6 +177,15 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
dec->dither_ = 1;
}
}
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+ // potentially allow alpha dithering
+ dec->alpha_dithering_ = options->alpha_dithering_strength;
+ if (dec->alpha_dithering_ > 100) {
+ dec->alpha_dithering_ = 100;
+ } else if (dec->alpha_dithering_ < 0) {
+ dec->alpha_dithering_ = 0;
+ }
+#endif
}
}
@@ -347,7 +356,7 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
} else {
WebPWorker* const worker = &dec->worker_;
// Finish previous job *before* updating context
- ok &= WebPWorkerSync(worker);
+ ok &= WebPGetWorkerInterface()->Sync(worker);
assert(worker->status_ == OK);
if (ok) { // spawn a new deblocking/output job
ctx->io_ = *io;
@@ -367,7 +376,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
ctx->f_info_ = dec->f_info_;
dec->f_info_ = tmp;
}
- WebPWorkerLaunch(worker); // (reconstruct)+filter in parallel
+ // (reconstruct)+filter in parallel
+ WebPGetWorkerInterface()->Launch(worker);
if (++dec->cache_id_ == dec->num_caches_) {
dec->cache_id_ = 0;
}
@@ -437,7 +447,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
int ok = 1;
if (dec->mt_method_ > 0) {
- ok = WebPWorkerSync(&dec->worker_);
+ ok = WebPGetWorkerInterface()->Sync(&dec->worker_);
}
if (io->teardown != NULL) {
@@ -478,7 +488,7 @@ static int InitThreadContext(VP8Decoder* const dec) {
dec->cache_id_ = 0;
if (dec->mt_method_ > 0) {
WebPWorker* const worker = &dec->worker_;
- if (!WebPWorkerReset(worker)) {
+ if (!WebPGetWorkerInterface()->Reset(worker)) {
return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
"thread initialization failed.");
}
@@ -502,7 +512,7 @@ int VP8GetThreadMethod(const WebPDecoderOptions* const options,
(void)headers;
(void)width;
(void)height;
- assert(!headers->is_lossless);
+ assert(headers == NULL || !headers->is_lossless);
#if defined(WEBP_USE_THREAD)
if (width < MIN_WIDTH_FOR_THREADS) return 0;
// TODO(skal): tune the heuristic further
@@ -549,7 +559,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
if (needed != (size_t)needed) return 0; // check for overflow
if (needed > dec->mem_size_) {
- free(dec->mem_);
+ WebPSafeFree(dec->mem_);
dec->mem_size_ = 0;
dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
if (dec->mem_ == NULL) {
diff --git a/src/3rdparty/libwebp/src/dec/idec.c b/src/3rdparty/libwebp/src/dec/idec.c
index 40d5ff6..e003851 100644
--- a/src/3rdparty/libwebp/src/dec/idec.c
+++ b/src/3rdparty/libwebp/src/dec/idec.c
@@ -72,28 +72,20 @@ struct WebPIDecoder {
MemBuffer mem_; // input memory buffer.
WebPDecBuffer output_; // output buffer (when no external one is supplied)
size_t chunk_size_; // Compressed VP8/VP8L size extracted from Header.
+
+ int last_mb_y_; // last row reached for intra-mode decoding
};
// MB context to restore in case VP8DecodeMB() fails
typedef struct {
VP8MB left_;
VP8MB info_;
- uint8_t intra_t_[4];
- uint8_t intra_l_[4];
- VP8BitReader br_;
VP8BitReader token_br_;
} MBContext;
//------------------------------------------------------------------------------
// MemBuffer: incoming data handling
-static void RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
- if (br->buf_ != NULL) {
- br->buf_ += offset;
- br->buf_end_ += offset;
- }
-}
-
static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
return (mem->end_ - mem->start_);
}
@@ -130,12 +122,12 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
if (offset != 0) {
int p;
for (p = 0; p <= last_part; ++p) {
- RemapBitReader(dec->parts_ + p, offset);
+ VP8RemapBitReader(dec->parts_ + p, offset);
}
// Remap partition #0 data pointer to new offset, but only in MAP
// mode (in APPEND mode, partition #0 is copied into a fixed memory).
if (mem->mode_ == MEM_MODE_MAP) {
- RemapBitReader(&dec->br_, offset);
+ VP8RemapBitReader(&dec->br_, offset);
}
}
assert(last_part >= 0);
@@ -189,7 +181,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
(uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
if (new_buf == NULL) return 0;
memcpy(new_buf, old_base, current_size);
- free(mem->buf_);
+ WebPSafeFree(mem->buf_);
mem->buf_ = new_buf;
mem->buf_size_ = (size_t)extra_size;
mem->start_ = new_mem_start;
@@ -231,8 +223,8 @@ static void InitMemBuffer(MemBuffer* const mem) {
static void ClearMemBuffer(MemBuffer* const mem) {
assert(mem);
if (mem->mode_ == MEM_MODE_APPEND) {
- free(mem->buf_);
- free((void*)mem->part0_buf_);
+ WebPSafeFree(mem->buf_);
+ WebPSafeFree((void*)mem->part0_buf_);
}
}
@@ -246,35 +238,36 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
return 1;
}
+// To be called last.
+static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+ const WebPDecoderOptions* const options = idec->params_.options;
+ WebPDecBuffer* const output = idec->params_.output;
+
+ idec->state_ = STATE_DONE;
+ if (options != NULL && options->flip) {
+ return WebPFlipBuffer(output);
+ }
+#endif
+ idec->state_ = STATE_DONE;
+ return VP8_STATUS_OK;
+}
+
//------------------------------------------------------------------------------
// Macroblock-decoding contexts
static void SaveContext(const VP8Decoder* dec, const VP8BitReader* token_br,
MBContext* const context) {
- const VP8BitReader* const br = &dec->br_;
- const VP8MB* const left = dec->mb_info_ - 1;
- const VP8MB* const info = dec->mb_info_ + dec->mb_x_;
-
- context->left_ = *left;
- context->info_ = *info;
- context->br_ = *br;
+ context->left_ = dec->mb_info_[-1];
+ context->info_ = dec->mb_info_[dec->mb_x_];
context->token_br_ = *token_br;
- memcpy(context->intra_t_, dec->intra_t_ + 4 * dec->mb_x_, 4);
- memcpy(context->intra_l_, dec->intra_l_, 4);
}
static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
VP8BitReader* const token_br) {
- VP8BitReader* const br = &dec->br_;
- VP8MB* const left = dec->mb_info_ - 1;
- VP8MB* const info = dec->mb_info_ + dec->mb_x_;
-
- *left = context->left_;
- *info = context->info_;
- *br = context->br_;
+ dec->mb_info_[-1] = context->left_;
+ dec->mb_info_[dec->mb_x_] = context->info_;
*token_br = context->token_br_;
- memcpy(dec->intra_t_ + 4 * dec->mb_x_, context->intra_t_, 4);
- memcpy(dec->intra_l_, context->intra_l_, 4);
}
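
The snapshot got cheaper in this version: intra modes are now parsed a full row at a time from partition #0 (see tree.c below), which is entirely buffered before decoding starts, so only the macroblock info and the token bit-reader can be invalidated by running out of input. In outline, the suspend/retry pattern these two helpers support, as used in DecodeRemaining() below:

    MBContext context;
    SaveContext(dec, token_br, &context);       // cheap snapshot before the attempt
    if (!VP8DecodeMB(dec, token_br)) {          // ran out of bytes mid-macroblock?
      RestoreContext(&context, dec, token_br);  // rewind to the snapshot
      return VP8_STATUS_SUSPENDED;              // retry once more data arrives
    }
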
//------------------------------------------------------------------------------
@@ -310,6 +303,7 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
headers.data = data;
headers.data_size = curr_size;
+ headers.have_all_data = 0;
status = WebPParseHeaders(&headers);
if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
return VP8_STATUS_SUSPENDED; // We haven't found a VP8 chunk yet.
@@ -363,30 +357,33 @@ static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
}
// Partition #0
-static int CopyParts0Data(WebPIDecoder* const idec) {
+static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
VP8BitReader* const br = &dec->br_;
- const size_t psize = br->buf_end_ - br->buf_;
+ const size_t part_size = br->buf_end_ - br->buf_;
MemBuffer* const mem = &idec->mem_;
assert(!idec->is_lossless_);
assert(mem->part0_buf_ == NULL);
- assert(psize > 0);
- assert(psize <= mem->part0_size_); // Format limit: no need for runtime check
+  // the following is a format limitation, no need for a runtime check:
+ assert(part_size <= mem->part0_size_);
+ if (part_size == 0) { // can't have zero-size partition #0
+ return VP8_STATUS_BITSTREAM_ERROR;
+ }
if (mem->mode_ == MEM_MODE_APPEND) {
// We copy and grab ownership of the partition #0 data.
- uint8_t* const part0_buf = (uint8_t*)malloc(psize);
+ uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, part_size);
if (part0_buf == NULL) {
- return 0;
+ return VP8_STATUS_OUT_OF_MEMORY;
}
- memcpy(part0_buf, br->buf_, psize);
+ memcpy(part0_buf, br->buf_, part_size);
mem->part0_buf_ = part0_buf;
br->buf_ = part0_buf;
- br->buf_end_ = part0_buf + psize;
+ br->buf_end_ = part0_buf + part_size;
} else {
// Else: just keep pointers to the partition #0's data in dec_->br_.
}
- mem->start_ += psize;
- return 1;
+ mem->start_ += part_size;
+ return VP8_STATUS_OK;
}
static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
@@ -420,8 +417,10 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
io->width, io->height);
VP8InitDithering(params->options, dec);
- if (!CopyParts0Data(idec)) {
- return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
+
+ dec->status_ = CopyParts0Data(idec);
+ if (dec->status_ != VP8_STATUS_OK) {
+ return IDecError(idec, dec->status_);
}
// Finish setting up the decoding parameters. Will call io->setup().
@@ -446,16 +445,26 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
assert(dec->ready_);
for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
- VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+ if (idec->last_mb_y_ != dec->mb_y_) {
+ if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
+        // note: normally, an error shouldn't occur here, since the whole
+        // partition #0 is already available in DecodeRemaining(). Reaching
+        // EOF while reading intra modes really means a BITSTREAM_ERROR.
+ return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
+ }
+ idec->last_mb_y_ = dec->mb_y_;
+ }
for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
+ VP8BitReader* const token_br =
+ &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
MBContext context;
SaveContext(dec, token_br, &context);
if (!VP8DecodeMB(dec, token_br)) {
- RestoreContext(&context, dec, token_br);
// We shouldn't fail when MAX_MB data was available
if (dec->num_parts_ == 1 && MemDataSize(&idec->mem_) > MAX_MB_SIZE) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
+ RestoreContext(&context, dec, token_br);
return VP8_STATUS_SUSPENDED;
}
// Release buffer only if there is only one partition
@@ -476,9 +485,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
return IDecError(idec, VP8_STATUS_USER_ABORT);
}
dec->ready_ = 0;
- idec->state_ = STATE_DONE;
-
- return VP8_STATUS_OK;
+ return FinishDecoding(idec);
}
static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
@@ -527,12 +534,16 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
}
if (!VP8LDecodeImage(dec)) {
+    // Decoding is only attempted once all the data bytes have been
+    // aggregated, so if the lossless decoder failed to decode all the pixels
+    // (VP8_STATUS_SUSPENDED), promote the error to VP8_STATUS_BITSTREAM_ERROR.
+ if (dec->status_ == VP8_STATUS_SUSPENDED) {
+ dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+ }
return ErrorStatusLossless(idec, dec->status_);
}
- idec->state_ = STATE_DONE;
-
- return VP8_STATUS_OK;
+ return FinishDecoding(idec);
}
// Main decoding loop
@@ -568,7 +579,7 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
// Public functions
WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
- WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
+ WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec));
if (idec == NULL) {
return NULL;
}
@@ -576,6 +587,8 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
idec->state_ = STATE_WEBP_HEADER;
idec->chunk_size_ = 0;
+ idec->last_mb_y_ = -1;
+
InitMemBuffer(&idec->mem_);
WebPInitDecBuffer(&idec->output_);
VP8InitIo(&idec->io_);
@@ -625,7 +638,7 @@ void WebPIDelete(WebPIDecoder* idec) {
}
ClearMemBuffer(&idec->mem_);
WebPFreeDecBuffer(&idec->output_);
- free(idec);
+ WebPSafeFree(idec);
}
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dec/io.c b/src/3rdparty/libwebp/src/dec/io.c
index 1ba376e..8094e44 100644
--- a/src/3rdparty/libwebp/src/dec/io.c
+++ b/src/3rdparty/libwebp/src/dec/io.c
@@ -17,6 +17,7 @@
#include "./webpi.h"
#include "../dsp/dsp.h"
#include "../dsp/yuv.h"
+#include "../utils/utils.h"
//------------------------------------------------------------------------------
// Main YUV<->RGB conversion functions
@@ -44,27 +45,13 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
// Point-sampling U/V sampler.
static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
- WebPDecBuffer* output = p->output;
- const WebPRGBABuffer* const buf = &output->u.RGBA;
- uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
- const uint8_t* y_src = io->y;
- const uint8_t* u_src = io->u;
- const uint8_t* v_src = io->v;
- const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace];
- const int mb_w = io->mb_w;
- const int last = io->mb_h - 1;
- int j;
- for (j = 0; j < last; j += 2) {
- sample(y_src, y_src + io->y_stride, u_src, v_src,
- dst, dst + buf->stride, mb_w);
- y_src += 2 * io->y_stride;
- u_src += io->uv_stride;
- v_src += io->uv_stride;
- dst += 2 * buf->stride;
- }
- if (j == last) { // Just do the last line twice
- sample(y_src, y_src, u_src, v_src, dst, dst, mb_w);
- }
+ WebPDecBuffer* const output = p->output;
+ WebPRGBABuffer* const buf = &output->u.RGBA;
+ uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+ WebPSamplerProcessPlane(io->y, io->y_stride,
+ io->u, io->v, io->uv_stride,
+ dst, buf->stride, io->mb_w, io->mb_h,
+ WebPSamplers[output->colorspace]);
return io->mb_h;
}
@@ -250,7 +237,11 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
int num_rows;
const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+#ifdef WEBP_SWAP_16BIT_CSP
+ uint8_t* alpha_dst = base_rgba;
+#else
uint8_t* alpha_dst = base_rgba + 1;
+#endif
uint32_t alpha_mask = 0x0f;
int i, j;
@@ -289,7 +280,17 @@ static int Rescale(const uint8_t* src, int src_stride,
static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
const int mb_h = io->mb_h;
const int uv_mb_h = (mb_h + 1) >> 1;
- const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y);
+ WebPRescaler* const scaler = &p->scaler_y;
+ int num_lines_out = 0;
+ if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) {
+ // Before rescaling, we premultiply the luma directly into the io->y
+ // internal buffer. This is OK since these samples are not used for
+ // intra-prediction (the top samples are saved in cache_y_/u_/v_).
+    // We do need to cast the const away, though.
+ WebPMultRows((uint8_t*)io->y, io->y_stride,
+ io->a, io->width, io->mb_w, mb_h, 0);
+ }
+ num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler);
Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
return num_lines_out;
@@ -297,7 +298,14 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
if (io->a != NULL) {
- Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+ const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+ uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
+ const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
+ const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+ if (num_lines_out > 0) { // unmultiply the Y
+ WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
+ p->scaler_a.dst_width, num_lines_out, 1);
+ }
}
return 0;
}
@@ -316,11 +324,11 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
size_t tmp_size;
int32_t* work;
- tmp_size = work_size + 2 * uv_work_size;
+ tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
if (has_alpha) {
- tmp_size += work_size;
+ tmp_size += work_size * sizeof(*work);
}
- p->memory = calloc(1, tmp_size * sizeof(*work));
+ p->memory = WebPSafeCalloc(1ULL, tmp_size);
if (p->memory == NULL) {
return 0; // memory error
}
@@ -347,6 +355,7 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
io->mb_w, out_width, io->mb_h, out_height,
work + work_size + 2 * uv_work_size);
p->emit_alpha = EmitRescaledAlphaYUV;
+ WebPInitAlphaProcessing();
}
return 1;
}
@@ -366,9 +375,9 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
WebPRescalerHasPendingOutput(&p->scaler_u)) {
assert(p->last_y + y_pos + num_lines_out < p->output->height);
assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
- WebPRescalerExportRow(&p->scaler_y);
- WebPRescalerExportRow(&p->scaler_u);
- WebPRescalerExportRow(&p->scaler_v);
+ WebPRescalerExportRow(&p->scaler_y, 0);
+ WebPRescalerExportRow(&p->scaler_u, 0);
+ WebPRescalerExportRow(&p->scaler_v, 0);
convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
dst, p->scaler_y.dst_width);
dst += buf->stride;
@@ -416,7 +425,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
int i;
assert(p->last_y + y_pos + num_lines_out < p->output->height);
- WebPRescalerExportRow(&p->scaler_a);
+ WebPRescalerExportRow(&p->scaler_a, 0);
for (i = 0; i < width; ++i) {
const uint32_t alpha_value = p->scaler_a.dst[i];
dst[4 * i] = alpha_value;
@@ -435,7 +444,11 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+#ifdef WEBP_SWAP_16BIT_CSP
+ uint8_t* alpha_dst = base_rgba;
+#else
uint8_t* alpha_dst = base_rgba + 1;
+#endif
int num_lines_out = 0;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int width = p->scaler_a.dst_width;
@@ -445,7 +458,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
int i;
assert(p->last_y + y_pos + num_lines_out < p->output->height);
- WebPRescalerExportRow(&p->scaler_a);
+ WebPRescalerExportRow(&p->scaler_a, 0);
for (i = 0; i < width; ++i) {
// Fill in the alpha value (converted to 4 bits).
const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@@ -484,7 +497,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
const size_t work_size = 2 * out_width; // scratch memory for one rescaler
int32_t* work; // rescalers work area
uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion
- size_t tmp_size1, tmp_size2;
+ size_t tmp_size1, tmp_size2, total_size;
tmp_size1 = 3 * work_size;
tmp_size2 = 3 * out_width;
@@ -492,7 +505,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
tmp_size1 += work_size;
tmp_size2 += out_width;
}
- p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+ total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
+ p->memory = WebPSafeCalloc(1ULL, total_size);
if (p->memory == NULL) {
return 0; // memory error
}
@@ -524,6 +538,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
} else {
p->emit_alpha_row = ExportAlpha;
}
+ WebPInitAlphaProcessing();
}
return 1;
}
@@ -544,7 +559,9 @@ static int CustomSetup(VP8Io* io) {
if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
return 0;
}
-
+ if (is_alpha && WebPIsPremultipliedMode(colorspace)) {
+ WebPInitUpsamplers();
+ }
if (io->use_scaling) {
const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
if (!ok) {
@@ -553,10 +570,10 @@ static int CustomSetup(VP8Io* io) {
} else {
if (is_rgb) {
p->emit = EmitSampledRGB; // default
-#ifdef FANCY_UPSAMPLING
if (io->fancy_upsampling) {
+#ifdef FANCY_UPSAMPLING
const int uv_width = (io->mb_w + 1) >> 1;
- p->memory = malloc(io->mb_w + 2 * uv_width);
+ p->memory = WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * uv_width));
if (p->memory == NULL) {
return 0; // memory error.
}
@@ -565,18 +582,22 @@ static int CustomSetup(VP8Io* io) {
p->tmp_v = p->tmp_u + uv_width;
p->emit = EmitFancyRGB;
WebPInitUpsamplers();
- }
#endif
+ } else {
+ WebPInitSamplers();
+ }
} else {
p->emit = EmitYUV;
}
if (is_alpha) { // need transparency output
- if (WebPIsPremultipliedMode(colorspace)) WebPInitPremultiply();
p->emit_alpha =
(colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
EmitAlphaRGBA4444
: is_rgb ? EmitAlphaRGB
: EmitAlphaYUV;
+ if (is_rgb) {
+ WebPInitAlphaProcessing();
+ }
}
}
@@ -610,7 +631,7 @@ static int CustomPut(const VP8Io* io) {
static void CustomTeardown(const VP8Io* io) {
WebPDecParams* const p = (WebPDecParams*)io->opaque;
- free(p->memory);
+ WebPSafeFree(p->memory);
p->memory = NULL;
}
@@ -625,4 +646,3 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
}
//------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/dec/tree.c b/src/3rdparty/libwebp/src/dec/tree.c
index bf9b7c5..31208d9 100644
--- a/src/3rdparty/libwebp/src/dec/tree.c
+++ b/src/3rdparty/libwebp/src/dec/tree.c
@@ -11,7 +11,8 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "vp8i.h"
+#include "./vp8i.h"
+#include "../utils/bit_reader_inl.h"
#define USE_GENERIC_TREE
@@ -278,10 +279,23 @@ void VP8ResetProba(VP8Proba* const proba) {
// proba->bands_[][] is initialized later
}
-void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
- uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
+static void ParseIntraMode(VP8BitReader* const br,
+ VP8Decoder* const dec, int mb_x) {
+ uint8_t* const top = dec->intra_t_ + 4 * mb_x;
uint8_t* const left = dec->intra_l_;
- VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+ VP8MBData* const block = dec->mb_data_ + mb_x;
+
+  // Note: we don't save the segment map (yet), as we don't expect
+  // to decode more than one keyframe.
+ if (dec->segment_hdr_.update_map_) {
+ // Hardcoded tree parsing
+ block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
+ ? VP8GetBit(br, dec->proba_.segments_[1])
+ : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
+ } else {
+ block->segment_ = 0; // default for intra
+ }
+ if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
block->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first
if (!block->is_i4x4_) {
@@ -332,6 +346,14 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
: VP8GetBit(br, 183) ? TM_PRED : H_PRED;
}
+int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
+ int mb_x;
+ for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+ ParseIntraMode(br, dec, mb_x);
+ }
+ return !dec->br_.eof_;
+}
+
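
Row-at-a-time parsing works because the intra modes live in partition #0, which callers guarantee is fully available; only the per-macroblock token partitions can run dry. The hardcoded tree above decodes the segment id from at most two bits; an equivalent helper (hypothetical, for illustration only):

    static int DecodeSegment(VP8BitReader* const br, const uint8_t* const p) {
      return !VP8GetBit(br, p[0]) ? VP8GetBit(br, p[1])       // -> segment 0 or 1
                                  : 2 + VP8GetBit(br, p[2]);  // -> segment 2 or 3
    }
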
//------------------------------------------------------------------------------
// Paragraph 13
diff --git a/src/3rdparty/libwebp/src/dec/vp8.c b/src/3rdparty/libwebp/src/dec/vp8.c
index bfd0e8f..89d478a 100644
--- a/src/3rdparty/libwebp/src/dec/vp8.c
+++ b/src/3rdparty/libwebp/src/dec/vp8.c
@@ -17,7 +17,8 @@
#include "./vp8i.h"
#include "./vp8li.h"
#include "./webpi.h"
-#include "../utils/bit_reader.h"
+#include "../utils/bit_reader_inl.h"
+#include "../utils/utils.h"
//------------------------------------------------------------------------------
@@ -44,10 +45,10 @@ int VP8InitIoInternal(VP8Io* const io, int version) {
}
VP8Decoder* VP8New(void) {
- VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
+ VP8Decoder* const dec = (VP8Decoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
if (dec != NULL) {
SetOk(dec);
- WebPWorkerInit(&dec->worker_);
+ WebPGetWorkerInterface()->Init(&dec->worker_);
dec->ready_ = 0;
dec->num_parts_ = 1;
}
@@ -68,7 +69,7 @@ const char* VP8StatusMessage(VP8Decoder* const dec) {
void VP8Delete(VP8Decoder* const dec) {
if (dec != NULL) {
VP8Clear(dec);
- free(dec);
+ WebPSafeFree(dec);
}
}
@@ -317,7 +318,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
VP8ResetProba(&dec->proba_);
ResetSegmentHeader(&dec->segment_hdr_);
- dec->segment_ = 0; // default for intra
}
// Check if we have all the partition #0 available, and initialize dec->br_
@@ -363,28 +363,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
VP8ParseProba(br, dec);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- // Extensions
- if (dec->pic_hdr_.colorspace_) {
- const size_t kTrailerSize = 8;
- const uint8_t kTrailerMarker = 0x01;
- const uint8_t* ext_buf = buf - kTrailerSize;
- size_t size;
-
- if (frm_hdr->partition_length_ < kTrailerSize ||
- ext_buf[kTrailerSize - 1] != kTrailerMarker) {
- return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
- "RIFF: Inconsistent extra information.");
- }
-
- // Layer
- size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
- dec->layer_data_size_ = size;
- dec->layer_data_ = NULL; // will be set later
- dec->layer_colorspace_ = ext_buf[3];
- }
-#endif
-
// sanitized state
dec->ready_ = 1;
return 1;
@@ -479,8 +457,8 @@ static int ParseResiduals(VP8Decoder* const dec,
VP8MB* const mb, VP8BitReader* const token_br) {
VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
const VP8BandProbas* ac_proba;
- const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_];
VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+ const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
int16_t* dst = block->coeffs_;
VP8MB* const left_mb = dec->mb_info_ - 1;
uint8_t tnz, lnz;
@@ -570,26 +548,10 @@ static int ParseResiduals(VP8Decoder* const dec,
// Main loop
int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
- VP8BitReader* const br = &dec->br_;
VP8MB* const left = dec->mb_info_ - 1;
VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
- int skip;
-
- // Note: we don't save segment map (yet), as we don't expect
- // to decode more than 1 keyframe.
- if (dec->segment_hdr_.update_map_) {
- // Hardcoded tree parsing
- dec->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) ?
- VP8GetBit(br, dec->proba_.segments_[1]) :
- 2 + VP8GetBit(br, dec->proba_.segments_[2]);
- }
- skip = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
-
- VP8ParseIntraMode(br, dec);
- if (br->eof_) {
- return 0;
- }
+ int skip = dec->use_skip_proba_ ? block->skip_ : 0;
if (!skip) {
skip = ParseResiduals(dec, mb, token_br);
@@ -600,11 +562,12 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
}
block->non_zero_y_ = 0;
block->non_zero_uv_ = 0;
+ block->dither_ = 0;
}
if (dec->filter_type_ > 0) { // store filter info
VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
- *finfo = dec->fstrengths_[dec->segment_][block->is_i4x4_];
+ *finfo = dec->fstrengths_[block->segment_][block->is_i4x4_];
finfo->f_inner_ |= !skip;
}
@@ -624,6 +587,10 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
// Parse bitstream for this row.
VP8BitReader* const token_br =
&dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+ if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
+ return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+ "Premature end-of-partition0 encountered.");
+ }
for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
if (!VP8DecodeMB(dec, token_br)) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
@@ -638,17 +605,8 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
}
}
if (dec->mt_method_ > 0) {
- if (!WebPWorkerSync(&dec->worker_)) return 0;
- }
-
- // Finish
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (dec->layer_data_size_ > 0) {
- if (!VP8DecodeLayer(dec)) {
- return 0;
- }
+ if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) return 0;
}
-#endif
return 1;
}
@@ -697,12 +655,10 @@ void VP8Clear(VP8Decoder* const dec) {
if (dec == NULL) {
return;
}
- if (dec->mt_method_ > 0) {
- WebPWorkerEnd(&dec->worker_);
- }
+ WebPGetWorkerInterface()->End(&dec->worker_);
ALPHDelete(dec->alph_dec_);
dec->alph_dec_ = NULL;
- free(dec->mem_);
+ WebPSafeFree(dec->mem_);
dec->mem_ = NULL;
dec->mem_size_ = 0;
memset(&dec->br_, 0, sizeof(dec->br_));
diff --git a/src/3rdparty/libwebp/src/dec/vp8i.h b/src/3rdparty/libwebp/src/dec/vp8i.h
index 3f4cf29..a02d9ff 100644
--- a/src/3rdparty/libwebp/src/dec/vp8i.h
+++ b/src/3rdparty/libwebp/src/dec/vp8i.h
@@ -31,7 +31,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 0
+#define DEC_REV_VERSION 3
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
@@ -195,6 +195,8 @@ typedef struct {
uint32_t non_zero_y_;
uint32_t non_zero_uv_;
uint8_t dither_; // local dithering strength (deduced from non_zero_*)
+ uint8_t skip_;
+ uint8_t segment_;
} VP8MBData;
// Persistent information needed by the parallel processing
@@ -265,7 +267,6 @@ struct VP8Decoder {
uint8_t* intra_t_; // top intra modes values: 4 * mb_w_
uint8_t intra_l_[4]; // left intra modes values
- uint8_t segment_; // segment of the currently parsed block
VP8TopSamples* yuv_t_; // top y/u/v samples
VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1)
@@ -295,12 +296,8 @@ struct VP8Decoder {
const uint8_t* alpha_data_; // compressed alpha data (if present)
size_t alpha_data_size_;
int is_alpha_decoded_; // true if alpha_data_ is decoded in alpha_plane_
- uint8_t* alpha_plane_; // output. Persistent, contains the whole data.
-
- // extensions
- int layer_colorspace_;
- const uint8_t* layer_data_; // compressed layer data (if present)
- size_t layer_data_size_;
+ uint8_t* alpha_plane_; // output. Persistent, contains the whole data.
+ int alpha_dithering_; // derived from decoding options (0=off, 100=full).
};
//------------------------------------------------------------------------------
@@ -313,7 +310,8 @@ int VP8SetError(VP8Decoder* const dec,
// in tree.c
void VP8ResetProba(VP8Proba* const proba);
void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
-void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec);
+// parses one row of intra mode data in partition 0, returns !eof
+int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
// in quant.c
void VP8ParseQuant(VP8Decoder* const dec);
@@ -347,9 +345,6 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
int row, int num_rows);
-// in layer.c
-int VP8DecodeLayer(VP8Decoder* const dec);
-
//------------------------------------------------------------------------------
#ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/dec/vp8l.c b/src/3rdparty/libwebp/src/dec/vp8l.c
index ea0254d..e2780e5 100644
--- a/src/3rdparty/libwebp/src/dec/vp8l.c
+++ b/src/3rdparty/libwebp/src/dec/vp8l.c
@@ -12,13 +12,13 @@
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
-#include <stdio.h>
#include <stdlib.h>
+
#include "./alphai.h"
#include "./vp8li.h"
+#include "../dsp/dsp.h"
#include "../dsp/lossless.h"
#include "../dsp/yuv.h"
-#include "../utils/alpha_processing.h"
#include "../utils/huffman.h"
#include "../utils/utils.h"
@@ -187,9 +187,10 @@ static int ReadHuffmanCodeLengths(
int max_symbol;
int prev_code_len = DEFAULT_CODE_LENGTH;
HuffmanTree tree;
+ int huff_codes[NUM_CODE_LENGTH_CODES] = { 0 };
- if (!HuffmanTreeBuildImplicit(&tree, code_length_code_lengths,
- NUM_CODE_LENGTH_CODES)) {
+ if (!VP8LHuffmanTreeBuildImplicit(&tree, code_length_code_lengths,
+ huff_codes, NUM_CODE_LENGTH_CODES)) {
dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
return 0;
}
@@ -232,11 +233,15 @@ static int ReadHuffmanCodeLengths(
ok = 1;
End:
- HuffmanTreeRelease(&tree);
+ VP8LHuffmanTreeFree(&tree);
+ if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
return ok;
}
+// 'code_lengths' is a pre-allocated temporary buffer, used for creating the
+// Huffman tree.
static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
+ int* const code_lengths, int* const huff_codes,
HuffmanTree* const tree) {
int ok = 0;
VP8LBitReader* const br = &dec->br_;
@@ -245,7 +250,6 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
if (simple_code) { // Read symbols, codes & code lengths directly.
int symbols[2];
int codes[2];
- int code_lengths[2];
const int num_symbols = VP8LReadBits(br, 1) + 1;
const int first_symbol_len_code = VP8LReadBits(br, 1);
// The first code is either 1 bit or 8 bit code.
@@ -258,10 +262,9 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
codes[1] = 1;
code_lengths[1] = num_symbols - 1;
}
- ok = HuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols,
- alphabet_size, num_symbols);
+ ok = VP8LHuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols,
+ alphabet_size, num_symbols);
} else { // Decode Huffman-coded code lengths.
- int* code_lengths = NULL;
int i;
int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 };
const int num_codes = VP8LReadBits(br, 4) + 4;
@@ -270,22 +273,15 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
return 0;
}
- code_lengths =
- (int*)WebPSafeCalloc((uint64_t)alphabet_size, sizeof(*code_lengths));
- if (code_lengths == NULL) {
- dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
- return 0;
- }
+ memset(code_lengths, 0, alphabet_size * sizeof(*code_lengths));
for (i = 0; i < num_codes; ++i) {
code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3);
}
ok = ReadHuffmanCodeLengths(dec, code_length_code_lengths, alphabet_size,
code_lengths);
- if (ok) {
- ok = HuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size);
- }
- free(code_lengths);
+ ok = ok && VP8LHuffmanTreeBuildImplicit(tree, code_lengths, huff_codes,
+ alphabet_size);
}
ok = ok && !br->error_;
if (!ok) {
@@ -295,19 +291,6 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
return 1;
}
-static void DeleteHtreeGroups(HTreeGroup* htree_groups, int num_htree_groups) {
- if (htree_groups != NULL) {
- int i, j;
- for (i = 0; i < num_htree_groups; ++i) {
- HuffmanTree* const htrees = htree_groups[i].htrees_;
- for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
- HuffmanTreeRelease(&htrees[j]);
- }
- }
- free(htree_groups);
- }
-}
-
static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
int color_cache_bits, int allow_recursion) {
int i, j;
@@ -316,6 +299,9 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
uint32_t* huffman_image = NULL;
HTreeGroup* htree_groups = NULL;
int num_htree_groups = 1;
+ int max_alphabet_size = 0;
+ int* code_lengths = NULL;
+ int* huff_codes = NULL;
if (allow_recursion && VP8LReadBits(br, 1)) {
// use meta Huffman codes.
@@ -341,11 +327,24 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
if (br->error_) goto Error;
- assert(num_htree_groups <= 0x10000);
- htree_groups =
- (HTreeGroup*)WebPSafeCalloc((uint64_t)num_htree_groups,
- sizeof(*htree_groups));
- if (htree_groups == NULL) {
+ // Find maximum alphabet size for the htree group.
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+ int alphabet_size = kAlphabetSize[j];
+ if (j == 0 && color_cache_bits > 0) {
+ alphabet_size += 1 << color_cache_bits;
+ }
+ if (max_alphabet_size < alphabet_size) {
+ max_alphabet_size = alphabet_size;
+ }
+ }
+
+ htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
+ code_lengths =
+ (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, sizeof(*code_lengths));
+ huff_codes =
+ (int*)WebPSafeMalloc((uint64_t)max_alphabet_size, sizeof(*huff_codes));
+
+ if (htree_groups == NULL || code_lengths == NULL || huff_codes == NULL) {
dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
goto Error;
}
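
For scale, assuming the usual VP8L constants (256 literal codes plus 24 length codes for the first alphabet, 40 distance codes), the worst-case scratch size is set by the first tree:

    // max_alphabet_size = 256 + 24 + (1 << color_cache_bits)
    //   e.g. color_cache_bits = 10  ->  280 + 1024 = 1304 entries,
    // allocated once up front instead of one calloc per Huffman tree.
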
@@ -354,12 +353,18 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
HuffmanTree* const htrees = htree_groups[i].htrees_;
for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
int alphabet_size = kAlphabetSize[j];
+ HuffmanTree* const htree = htrees + j;
if (j == 0 && color_cache_bits > 0) {
alphabet_size += 1 << color_cache_bits;
}
- if (!ReadHuffmanCode(alphabet_size, dec, htrees + j)) goto Error;
+ if (!ReadHuffmanCode(alphabet_size, dec, code_lengths, huff_codes,
+ htree)) {
+ goto Error;
+ }
}
}
+ WebPSafeFree(huff_codes);
+ WebPSafeFree(code_lengths);
// All OK. Finalize pointers and return.
hdr->huffman_image_ = huffman_image;
@@ -368,8 +373,10 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
return 1;
Error:
- free(huffman_image);
- DeleteHtreeGroups(htree_groups, num_htree_groups);
+ WebPSafeFree(huff_codes);
+ WebPSafeFree(code_lengths);
+ WebPSafeFree(huffman_image);
+ VP8LHtreeGroupsFree(htree_groups, num_htree_groups);
return 0;
}
@@ -420,7 +427,7 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
int num_lines_out = 0;
while (WebPRescalerHasPendingOutput(rescaler)) {
uint8_t* const dst = rgba + num_lines_out * rgba_stride;
- WebPRescalerExportRow(rescaler);
+ WebPRescalerExportRow(rescaler, 0);
WebPMultARGBRow(src, dst_width, 1);
VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
++num_lines_out;
@@ -468,6 +475,7 @@ static int EmitRows(WEBP_CSP_MODE colorspace,
//------------------------------------------------------------------------------
// Export to YUVA
+// TODO(skal): should be in yuv.c
static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
const WebPDecBuffer* const output) {
const WebPYUVABuffer* const buf = &output->u.YUVA;
@@ -537,7 +545,7 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
const int dst_width = rescaler->dst_width;
int num_lines_out = 0;
while (WebPRescalerHasPendingOutput(rescaler)) {
- WebPRescalerExportRow(rescaler);
+ WebPRescalerExportRow(rescaler, 0);
WebPMultARGBRow(src, dst_width, 1);
ConvertToYUVA(src, dst_width, y_pos, dec->output_);
++y_pos;
@@ -740,6 +748,7 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
const int mask = hdr->huffman_mask_;
assert(htree_group != NULL);
+ assert(pos < end);
assert(last_row <= height);
assert(Is8bOptimizable(hdr));
@@ -793,6 +802,7 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
ok = 0;
goto End;
}
+ assert(br->eos_ == VP8LIsEndOfStream(br));
ok = !br->error_;
if (!ok) goto End;
}
@@ -830,6 +840,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
(hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;
const int mask = hdr->huffman_mask_;
assert(htree_group != NULL);
+ assert(src < src_end);
assert(src_last <= src_end);
while (!br->eos_ && src < src_last) {
@@ -849,7 +860,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
blue = ReadSymbol(&htree_group->htrees_[BLUE], br);
alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);
- *src = (alpha << 24) | (red << 16) | (green << 8) | blue;
+ *src = ((uint32_t)alpha << 24) | (red << 16) | (green << 8) | blue;
AdvanceByOne:
++src;
++col;
@@ -889,7 +900,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
process_func(dec, row);
}
}
- if (src < src_last) {
+ if (src < src_end) {
if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
if (color_cache != NULL) {
while (last_cached < src) {
@@ -909,6 +920,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
ok = 0;
goto End;
}
+ assert(br->eos_ == VP8LIsEndOfStream(br));
ok = !br->error_;
if (!ok) goto End;
}
@@ -931,7 +943,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
// VP8LTransform
static void ClearTransform(VP8LTransform* const transform) {
- free(transform->data_);
+ WebPSafeFree(transform->data_);
transform->data_ = NULL;
}
@@ -955,7 +967,7 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
}
for (; i < 4 * final_num_colors; ++i)
new_data[i] = 0; // black tail.
- free(transform->data_);
+ WebPSafeFree(transform->data_);
transform->data_ = new_color_map;
}
return 1;
@@ -1025,8 +1037,8 @@ static void InitMetadata(VP8LMetadata* const hdr) {
static void ClearMetadata(VP8LMetadata* const hdr) {
assert(hdr);
- free(hdr->huffman_image_);
- DeleteHtreeGroups(hdr->htree_groups_, hdr->num_htree_groups_);
+ WebPSafeFree(hdr->huffman_image_);
+ VP8LHtreeGroupsFree(hdr->htree_groups_, hdr->num_htree_groups_);
VP8LColorCacheClear(&hdr->color_cache_);
InitMetadata(hdr);
}
@@ -1035,7 +1047,7 @@ static void ClearMetadata(VP8LMetadata* const hdr) {
// VP8LDecoder
VP8LDecoder* VP8LNew(void) {
- VP8LDecoder* const dec = (VP8LDecoder*)calloc(1, sizeof(*dec));
+ VP8LDecoder* const dec = (VP8LDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
if (dec == NULL) return NULL;
dec->status_ = VP8_STATUS_OK;
dec->action_ = READ_DIM;
@@ -1051,7 +1063,7 @@ void VP8LClear(VP8LDecoder* const dec) {
if (dec == NULL) return;
ClearMetadata(&dec->hdr_);
- free(dec->pixels_);
+ WebPSafeFree(dec->pixels_);
dec->pixels_ = NULL;
for (i = 0; i < dec->next_transform_; ++i) {
ClearTransform(&dec->transforms_[i]);
@@ -1059,7 +1071,7 @@ void VP8LClear(VP8LDecoder* const dec) {
dec->next_transform_ = 0;
dec->transforms_seen_ = 0;
- free(dec->rescaler_memory);
+ WebPSafeFree(dec->rescaler_memory);
dec->rescaler_memory = NULL;
dec->output_ = NULL; // leave no trace behind
@@ -1068,7 +1080,7 @@ void VP8LClear(VP8LDecoder* const dec) {
void VP8LDelete(VP8LDecoder* const dec) {
if (dec != NULL) {
VP8LClear(dec);
- free(dec);
+ WebPSafeFree(dec);
}
}
@@ -1155,7 +1167,7 @@ static int DecodeImageStream(int xsize, int ysize,
End:
if (!ok) {
- free(data);
+ WebPSafeFree(data);
ClearMetadata(hdr);
// If not enough data (br.eos_) resulted in BIT_STREAM_ERROR, update the
// status appropriately.
@@ -1294,6 +1306,10 @@ int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
assert(dec->action_ == READ_DATA);
assert(last_row <= dec->height_);
+ if (dec->last_pixel_ == dec->width_ * dec->height_) {
+ return 1; // done
+ }
+
// Decode (with special row processing).
return alph_dec->use_8b_decode ?
DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
@@ -1341,6 +1357,10 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
// Sanity checks.
if (dec == NULL) return 0;
+ dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+ assert(dec->hdr_.htree_groups_ != NULL);
+ assert(dec->hdr_.num_htree_groups_ > 0);
+
io = dec->io_;
assert(io != NULL);
params = (WebPDecParams*)io->opaque;
@@ -1358,6 +1378,11 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
+ if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
+ // need the alpha-multiply functions for premultiplied output or rescaling
+ WebPInitAlphaProcessing();
+ }
+
// Decode.
dec->action_ = READ_DATA;
if (!DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
@@ -1377,4 +1402,3 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
}
//------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/dec/vp8li.h b/src/3rdparty/libwebp/src/dec/vp8li.h
index afa294d..21c593f 100644
--- a/src/3rdparty/libwebp/src/dec/vp8li.h
+++ b/src/3rdparty/libwebp/src/dec/vp8li.h
@@ -20,7 +20,6 @@
#include "../utils/bit_reader.h"
#include "../utils/color_cache.h"
#include "../utils/huffman.h"
-#include "../webp/format_constants.h"
#ifdef __cplusplus
extern "C" {
@@ -42,10 +41,6 @@ struct VP8LTransform {
};
typedef struct {
- HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE];
-} HTreeGroup;
-
-typedef struct {
int color_cache_size_;
VP8LColorCache color_cache_;
diff --git a/src/3rdparty/libwebp/src/dec/webp.c b/src/3rdparty/libwebp/src/dec/webp.c
index fda88bd..59e21a9 100644
--- a/src/3rdparty/libwebp/src/dec/webp.c
+++ b/src/3rdparty/libwebp/src/dec/webp.c
@@ -52,13 +52,14 @@ static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
}
// Validates the RIFF container (if detected) and skips over it.
-// If a RIFF container is detected,
-// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and
-// VP8_STATUS_OK otherwise.
+// If a RIFF container is detected, returns:
+// VP8_STATUS_BITSTREAM_ERROR for invalid header,
+// VP8_STATUS_NOT_ENOUGH_DATA for truncated data if have_all_data is true,
+// and VP8_STATUS_OK otherwise.
// In case there are not enough bytes (partial RIFF container), return 0 for
// *riff_size. Else return the RIFF size extracted from the header.
static VP8StatusCode ParseRIFF(const uint8_t** const data,
- size_t* const data_size,
+ size_t* const data_size, int have_all_data,
size_t* const riff_size) {
assert(data != NULL);
assert(data_size != NULL);
@@ -77,6 +78,9 @@ static VP8StatusCode ParseRIFF(const uint8_t** const data,
if (size > MAX_CHUNK_PAYLOAD) {
return VP8_STATUS_BITSTREAM_ERROR;
}
+ if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
+ return VP8_STATUS_NOT_ENOUGH_DATA; // Truncated bitstream.
+ }
// We have a RIFF container. Skip it.
*riff_size = size;
*data += RIFF_HEADER_SIZE;
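
A worked instance of the new check (numbers hypothetical): with have_all_data set and only 508 input bytes, a RIFF header declaring a 1000-byte payload fails immediately:

    // size (1000) > *data_size (508) - CHUNK_HEADER_SIZE (8)
    //   -> VP8_STATUS_NOT_ENOUGH_DATA (truncated bitstream),
    // so a caller that already holds the whole file gets a definitive error
    // instead of waiting for bytes that will never arrive.
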
@@ -223,9 +227,8 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
// extracted from the VP8/VP8L chunk header.
// The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
- size_t* const data_size,
- size_t riff_size,
- size_t* const chunk_size,
+ size_t* const data_size, int have_all_data,
+ size_t riff_size, size_t* const chunk_size,
int* const is_lossless) {
const uint8_t* const data = *data_ptr;
const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
@@ -248,6 +251,9 @@ static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
return VP8_STATUS_BITSTREAM_ERROR; // Inconsistent size information.
}
+ if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
+ return VP8_STATUS_NOT_ENOUGH_DATA; // Truncated bitstream.
+ }
// Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
*chunk_size = size;
*data_ptr += CHUNK_HEADER_SIZE;
@@ -291,6 +297,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
int found_vp8x = 0;
int animation_present = 0;
int fragments_present = 0;
+ const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
VP8StatusCode status;
WebPHeaderStructure hdrs;
@@ -303,7 +310,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
hdrs.data_size = data_size;
// Skip over RIFF header.
- status = ParseRIFF(&data, &data_size, &hdrs.riff_size);
+ status = ParseRIFF(&data, &data_size, have_all_data, &hdrs.riff_size);
if (status != VP8_STATUS_OK) {
return status; // Wrong RIFF header / insufficient data.
}
@@ -353,7 +360,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
}
// Skip over VP8/VP8L header.
- status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
+ status = ParseVP8Header(&data, &data_size, have_all_data, hdrs.riff_size,
&hdrs.compressed_size, &hdrs.is_lossless);
if (status != VP8_STATUS_OK) {
goto ReturnWidthHeight; // Wrong VP8/VP8L chunk-header / insufficient data.
@@ -452,6 +459,7 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
headers.data = data;
headers.data_size = data_size;
+ headers.have_all_data = 1;
status = WebPParseHeaders(&headers); // Process Pre-VP8 chunks.
if (status != VP8_STATUS_OK) {
return status;
@@ -512,6 +520,12 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
if (status != VP8_STATUS_OK) {
WebPFreeDecBuffer(params->output);
}
+
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+ if (params->options != NULL && params->options->flip) {
+ status = WebPFlipBuffer(params->output);
+ }
+#endif
return status;
}
@@ -776,9 +790,9 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
h = options->crop_height;
x = options->crop_left;
y = options->crop_top;
- if (!WebPIsRGBMode(src_colorspace)) { // only snap for YUV420 or YUV422
+ if (!WebPIsRGBMode(src_colorspace)) { // only snap for YUV420
x &= ~1;
- y &= ~1; // TODO(later): only for YUV420, not YUV422.
+ y &= ~1;
}
if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
return 0; // out of frame boundary error
diff --git a/src/3rdparty/libwebp/src/dec/webpi.h b/src/3rdparty/libwebp/src/dec/webpi.h
index d915f5e..457c72e 100644
--- a/src/3rdparty/libwebp/src/dec/webpi.h
+++ b/src/3rdparty/libwebp/src/dec/webpi.h
@@ -54,6 +54,7 @@ void WebPResetDecParams(WebPDecParams* const params);
typedef struct {
const uint8_t* data; // input buffer
size_t data_size; // input buffer size
+ int have_all_data; // true if all data is known to be available
size_t offset; // offset to main data chunk (VP8 or VP8L)
const uint8_t* alpha_data; // points to alpha chunk (if present)
size_t alpha_data_size; // alpha chunk size
@@ -93,10 +94,15 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
// dimension / etc.). If *options is not NULL, also verify that the options'
// parameters are valid and apply them to the width/height dimensions of the
// output buffer. This takes cropping / scaling / rotation into account.
+// Also incorporates the options->flip flag to flip the buffer parameters if
+// needed.
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const WebPDecoderOptions* const options,
WebPDecBuffer* const buffer);
+// Flip buffer vertically by negating the various strides.
+VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);
+
// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
// memory (still held by 'src').
void WebPCopyDecBuffer(const WebPDecBuffer* const src,
@@ -105,8 +111,6 @@ void WebPCopyDecBuffer(const WebPDecBuffer* const src,
// Copy and transfer ownership from src to dst (beware of parameter order!)
void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
-
-
//------------------------------------------------------------------------------
#ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/demux/demux.c b/src/3rdparty/libwebp/src/demux/demux.c
index f66ac6d..55a7918 100644
--- a/src/3rdparty/libwebp/src/demux/demux.c
+++ b/src/3rdparty/libwebp/src/demux/demux.c
@@ -11,7 +11,7 @@
//
#ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "../webp/config.h"
#endif
#include <assert.h>
@@ -25,7 +25,7 @@
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 0
+#define DMUX_REV_VERSION 2
typedef struct {
size_t start_; // start location of the data
@@ -289,7 +289,7 @@ static ParseStatus NewFrame(const MemBuffer* const mem,
if (actual_size < min_size) return PARSE_ERROR;
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
- *frame = (Frame*)calloc(1, sizeof(**frame));
+ *frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(**frame));
return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
}
@@ -317,7 +317,7 @@ static ParseStatus ParseAnimationFrame(
(bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
- free(frame);
+ WebPSafeFree(frame);
return PARSE_ERROR;
}
@@ -333,7 +333,7 @@ static ParseStatus ParseAnimationFrame(
}
}
- if (!added_frame) free(frame);
+ if (!added_frame) WebPSafeFree(frame);
return status;
}
@@ -368,7 +368,7 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
}
}
- if (!added_fragment) free(frame);
+ if (!added_fragment) WebPSafeFree(frame);
return status;
}
#endif // WEBP_EXPERIMENTAL_FEATURES
@@ -379,7 +379,7 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
// Returns true on success, false otherwise.
static int StoreChunk(WebPDemuxer* const dmux,
size_t start_offset, uint32_t size) {
- Chunk* const chunk = (Chunk*)calloc(1, sizeof(*chunk));
+ Chunk* const chunk = (Chunk*)WebPSafeCalloc(1ULL, sizeof(*chunk));
if (chunk == NULL) return 0;
chunk->data_.offset_ = start_offset;
@@ -427,7 +427,7 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
- frame = (Frame*)calloc(1, sizeof(*frame));
+ frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
if (frame == NULL) return PARSE_ERROR;
// For the single image case we allow parsing of a partial frame, but we need
@@ -458,7 +458,7 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
}
}
- if (!image_added) free(frame);
+ if (!image_added) WebPSafeFree(frame);
return status;
}
@@ -729,7 +729,7 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
partial = (mem.buf_size_ < mem.riff_end_);
if (!allow_partial && partial) return NULL;
- dmux = (WebPDemuxer*)calloc(1, sizeof(*dmux));
+ dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
if (dmux == NULL) return NULL;
InitDemux(dmux, &mem);
@@ -761,14 +761,14 @@ void WebPDemuxDelete(WebPDemuxer* dmux) {
for (f = dmux->frames_; f != NULL;) {
Frame* const cur_frame = f;
f = f->next_;
- free(cur_frame);
+ WebPSafeFree(cur_frame);
}
for (c = dmux->chunks_; c != NULL;) {
Chunk* const cur_chunk = c;
c = c->next_;
- free(cur_chunk);
+ WebPSafeFree(cur_chunk);
}
- free(dmux);
+ WebPSafeFree(dmux);
}
// -----------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 7362ff9..c8e0b4b 100644
--- a/src/3rdparty/libwebp/src/utils/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -12,7 +12,7 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
-#include "./alpha_processing.h"
+#include "./dsp.h"
// Tables can be faster on some platform but incur some extra binary size (~2k).
// #define USE_TABLES_FOR_ALPHA_MULT
@@ -134,7 +134,7 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
#endif // USE_TABLES_FOR_ALPHA_MULT
-void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
@@ -154,17 +154,8 @@ void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse) {
}
}
-void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
- int inverse) {
- int n;
- for (n = 0; n < num_rows; ++n) {
- WebPMultARGBRow((uint32_t*)ptr, width, inverse);
- ptr += stride;
- }
-}
-
-void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
- int width, int inverse) {
+static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
@@ -179,6 +170,26 @@ void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
}
}
+#undef KINV_255
+#undef HALF
+#undef MFIX
+
+void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse);
+
+//------------------------------------------------------------------------------
+// Generic per-plane calls
+
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+ int inverse) {
+ int n;
+ for (n = 0; n < num_rows; ++n) {
+ WebPMultARGBRow((uint32_t*)ptr, width, inverse);
+ ptr += stride;
+ }
+}
+
void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
int width, int num_rows, int inverse) {
@@ -190,7 +201,135 @@ void WebPMultRows(uint8_t* ptr, int stride,
}
}
-#undef KINV_255
-#undef HALF
-#undef MFIX
+//------------------------------------------------------------------------------
+// Premultiplied modes
+
+// non-dithered modes
+
+// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
+// for all 8-bit x and a. For bit-wise equivalence to (int)(x * a / 255. + .5),
+// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
+#if 1 // (int)(x * a / 255.)
+#define MULTIPLIER(a) ((a) * 32897U)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+#else // (int)(x * a / 255. + .5)
+#define MULTIPLIER(a) ((a) * 65793U)
+#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
+#endif
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+ int w, int h, int stride) {
+ while (h-- > 0) {
+ uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+ const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+ int i;
+ for (i = 0; i < w; ++i) {
+ const uint32_t a = alpha[4 * i];
+ if (a != 0xff) {
+ const uint32_t mult = MULTIPLIER(a);
+ rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+ rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+ rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+ }
+ }
+ rgba += stride;
+ }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
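
The bit-trick in the comment above can be verified exhaustively over all 8-bit inputs; a standalone check, for illustration only (not part of the library):

    #include <assert.h>

    int main(void) {
      unsigned x, a;
      for (a = 0; a < 256; ++a) {
        const unsigned m = a * 32897U;                  // MULTIPLIER(a)
        for (x = 0; x < 256; ++x) {
          assert(((x * m) >> 23) == (x * a) / 255U);    // == (int)(x * a / 255.)
        }
      }
      return 0;
    }
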
+
+// rgbA4444
+
+#define MULTIPLIER(a) ((a) * 0x1111) // 0x1111 ~= (1 << 16) / 15
+
+static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
+ return (x & 0xf0) | (x >> 4);
+}
+
+static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
+ return (x & 0x0f) | (x << 4);
+}
+
+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+ return (x * m) >> 16;
+}
+
+static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
+ int w, int h, int stride,
+ int rg_byte_pos /* 0 or 1 */) {
+ while (h-- > 0) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ const uint32_t rg = rgba4444[2 * i + rg_byte_pos];
+ const uint32_t ba = rgba4444[2 * i + (rg_byte_pos ^ 1)];
+ const uint8_t a = ba & 0x0f;
+ const uint32_t mult = MULTIPLIER(a);
+ const uint8_t r = multiply(dither_hi(rg), mult);
+ const uint8_t g = multiply(dither_lo(rg), mult);
+ const uint8_t b = multiply(dither_hi(ba), mult);
+ rgba4444[2 * i + rg_byte_pos] = (r & 0xf0) | ((g >> 4) & 0x0f);
+ rgba4444[2 * i + (rg_byte_pos ^ 1)] = (b & 0xf0) | a;
+ }
+ rgba4444 += stride;
+ }
+}
+#undef MULTIPLIER
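
dither_hi()/dither_lo() expand a 4-bit channel to 8 bits by replicating the nibble, which maps the 16 possible values evenly onto 0x00..0xff (v -> v * 0x11); a quick check of that property:

    #include <assert.h>

    int main(void) {
      int v;
      for (v = 0; v < 16; ++v) {
        assert((((v << 4) & 0xf0) | ((v << 4) >> 4)) == v * 0x11);
      }
      return 0;
    }
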
+
+static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
+ int w, int h, int stride) {
+#ifdef WEBP_SWAP_16BIT_CSP
+ ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+#else
+ ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+#endif
+}
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
+ uint8_t alpha_mask = 0xff;
+ int i, j;
+
+ for (j = 0; j < height; ++j) {
+ for (i = 0; i < width; ++i) {
+ const uint8_t alpha_value = argb[4 * i];
+ alpha[i] = alpha_value;
+ alpha_mask &= alpha_value;
+ }
+ argb += argb_stride;
+ alpha += alpha_stride;
+ }
+ return (alpha_mask == 0xff);
+}
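
The boolean result lets a caller detect a fully opaque image in the same pass that extracts the plane; a usage sketch (buffers assumed set up elsewhere, ARGB interleaved with alpha first):

    WebPInitAlphaProcessing();  // installs the WebPExtractAlpha pointer
    if (WebPExtractAlpha(argb, 4 * width, width, height, alpha, width)) {
      // every alpha byte was 0xff: the plane can be dropped entirely.
    }
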
+
+void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
+void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
+ (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
+
+void WebPInitAlphaProcessing(void) {
+ if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ WebPMultARGBRow = MultARGBRow;
+ WebPMultRow = MultRow;
+ WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+ WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+ WebPExtractAlpha = ExtractAlpha;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitAlphaProcessingSSE2();
+ }
+#endif
+ }
+ alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
+}
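
The guard pointer starts out pointing at itself, a value VP8GetCPUInfo can never equal, so the first call always installs the defaults; later calls are cheap no-ops unless VP8GetCPUInfo changes. Call sites (e.g. the io.c and vp8l.c hunks above) therefore simply do, for instance:

    WebPInitAlphaProcessing();  // idempotent: plain-C versions, then SSE2 overrides
    WebPMultRows(dst_y, y_stride, src_a, a_stride, width, num_rows, 1 /*unmultiply*/);
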
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
new file mode 100644
index 0000000..3d0a9b5
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -0,0 +1,77 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
+  // alpha_and accumulates the bitwise 'and' of all the alpha[] values. The
+  // final value is not 0xff if any of the alpha[] is not equal to 0xff.
+ uint32_t alpha_and = 0xff;
+ int i, j;
+ const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha
+ const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+ __m128i all_alphas = all_0xff;
+
+  // We must be able to read 3 extra bytes after the last loaded byte
+  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+  // last byte of the quadruplet.
+ const int limit = (width - 1) & ~7;
+
+ for (j = 0; j < height; ++j) {
+ const __m128i* src = (const __m128i*)argb;
+ for (i = 0; i < limit; i += 8) {
+ // load 32 argb bytes
+ const __m128i a0 = _mm_loadu_si128(src + 0);
+ const __m128i a1 = _mm_loadu_si128(src + 1);
+ const __m128i b0 = _mm_and_si128(a0, a_mask);
+ const __m128i b1 = _mm_and_si128(a1, a_mask);
+ const __m128i c0 = _mm_packs_epi32(b0, b1);
+ const __m128i d0 = _mm_packus_epi16(c0, c0);
+ // store
+ _mm_storel_epi64((__m128i*)&alpha[i], d0);
+ // accumulate eight alpha 'and' in parallel
+ all_alphas = _mm_and_si128(all_alphas, d0);
+ src += 2;
+ }
+ for (; i < width; ++i) {
+ const uint32_t alpha_value = argb[4 * i];
+ alpha[i] = alpha_value;
+ alpha_and &= alpha_value;
+ }
+ argb += argb_stride;
+ alpha += alpha_stride;
+ }
+  // Combine the eight alpha 'and' values into an 8-bit mask.
+ alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+ return (alpha_and == 0xff);
+}
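
Each iteration reads 8 ARGB pixels (32 bytes), masks every 32-bit lane down to its low byte, then narrows 32 -> 16 -> 8 bits with _mm_packs_epi32/_mm_packus_epi16; the saturating packs are exact here because every masked lane already fits in 8 bits. A plain-C rendering of one iteration (illustration only, assuming little-endian quadruplets with alpha in the low byte):

#include <stdint.h>
#include <stdio.h>

static void ExtractAlpha8(const uint8_t* argb, uint8_t out[8], uint8_t* all) {
  int k;
  for (k = 0; k < 8; ++k) {
    out[k] = argb[4 * k];  // a_mask keeps only the low byte of each lane
    *all &= out[k];        // mirrors the running _mm_and_si128 accumulation
  }
}

int main(void) {
  uint8_t pixels[32], out[8], all = 0xff;
  int k;
  for (k = 0; k < 32; ++k) pixels[k] = 0xff;
  pixels[4 * 3] = 0x80;    // pixel 3 is translucent
  ExtractAlpha8(pixels, out, &all);
  printf("all opaque: %d\n", all == 0xff);  // prints 0
  return 0;
}
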
+
+#endif // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+void WebPInitAlphaProcessingSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+ WebPExtractAlpha = ExtractAlpha;
+#endif
+}
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index 7a1f417..ef04a75 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -29,19 +29,54 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
"cpuid\n"
"xchg %%edi, %%ebx\n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
+ : "a"(info_type), "c"(0));
}
#elif defined(__i386__) || defined(__x86_64__)
static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
+ : "a"(info_type), "c"(0));
}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
+#include <intrin.h>
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
#elif defined(WEBP_MSC_SSE2)
#define GetCPUInfo __cpuid
#endif
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile (
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx) : "c" (ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
static int x86CPUInfo(CPUFeature feature) {
int cpu_info[4];
@@ -52,10 +87,23 @@ static int x86CPUInfo(CPUFeature feature) {
if (feature == kSSE3) {
return 0 != (cpu_info[2] & 0x00000001);
}
+ if (feature == kAVX) {
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((cpu_info[2] & 0x18000000) == 0x18000000) {
+ // XMM state and YMM state enabled by the OS.
+ return (xgetbv() & 0x6) == 0x6;
+ }
+ }
+ if (feature == kAVX2) {
+ if (x86CPUInfo(kAVX)) {
+ GetCPUInfo(cpu_info, 7);
+ return ((cpu_info[1] & 0x00000020) == 0x00000020);
+ }
+ }
return 0;
}
VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
-#elif defined(WEBP_ANDROID_NEON)
+#elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test.
static int AndroidCPUInfo(CPUFeature feature) {
const AndroidCpuFamily cpu_family = android_getCpuFamily();
const uint64_t cpu_features = android_getCpuFeatures();
@@ -66,7 +114,7 @@ static int AndroidCPUInfo(CPUFeature feature) {
return 0;
}
VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
-#elif defined(__ARM_NEON__)
+#elif defined(WEBP_USE_NEON)
// define a dummy function to enable turning off NEON at runtime by setting
// VP8GetCPUInfo = NULL
static int armCPUInfo(CPUFeature feature) {
@@ -74,6 +122,12 @@ static int armCPUInfo(CPUFeature feature) {
return 1;
}
VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32)
+static int mipsCPUInfo(CPUFeature feature) {
+ (void)feature;
+ return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
#else
VP8CPUInfo VP8GetCPUInfo = NULL;
#endif
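
The kAVX test above is two-stage: CPUID leaf 1 must report both OSXSAVE (ECX bit 27) and AVX (bit 28), and XGETBV(0) must confirm that the OS actually saves XMM and YMM register state (bits 1 and 2, hence the 0x6 mask). A hedged, x86-only sketch of the same check using GCC/Clang's <cpuid.h> (the code above deliberately avoids that header for older-toolchain compatibility):

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

static int HasAVX(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 0;
  if ((ecx & 0x18000000u) != 0x18000000u) return 0;  // need OSXSAVE + AVX
  {
    uint32_t lo, hi;  // xgetbv(0), emitted as a raw opcode as above
    __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c"(0));
    (void)hi;
    return (lo & 0x6u) == 0x6u;  // OS saves both XMM and YMM state
  }
}

int main(void) {
  printf("AVX usable: %d\n", HasAVX());
  return 0;
}
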
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index 8b246fa..3a8dc81 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -15,37 +15,6 @@
#include "../dec/vp8i.h"
//------------------------------------------------------------------------------
-// run-time tables (~4k)
-
-static uint8_t abs0[255 + 255 + 1]; // abs(i)
-static uint8_t abs1[255 + 255 + 1]; // abs(i)>>1
-static int8_t sclip1[1020 + 1020 + 1]; // clips [-1020, 1020] to [-128, 127]
-static int8_t sclip2[112 + 112 + 1]; // clips [-112, 112] to [-16, 15]
-static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
-
-// We declare this variable 'volatile' to prevent instruction reordering
-// and make sure it's set to true _last_ (so as to be thread-safe)
-static volatile int tables_ok = 0;
-
-static void DspInitTables(void) {
- if (!tables_ok) {
- int i;
- for (i = -255; i <= 255; ++i) {
- abs0[255 + i] = (i < 0) ? -i : i;
- abs1[255 + i] = abs0[255 + i] >> 1;
- }
- for (i = -1020; i <= 1020; ++i) {
- sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
- }
- for (i = -112; i <= 112; ++i) {
- sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
- }
- for (i = -255; i <= 255 + 255; ++i) {
- clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
- }
- tables_ok = 1;
- }
-}
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
@@ -146,10 +115,10 @@ static void TransformDC(const int16_t *in, uint8_t* dst) {
}
static void TransformDCUV(const int16_t* in, uint8_t* dst) {
- if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
- if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
- if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
- if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+ if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
+ if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
+ if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
+ if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
}
#undef STORE
@@ -184,7 +153,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
}
}
-void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
//------------------------------------------------------------------------------
// Intra predictions
@@ -193,7 +162,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
const uint8_t* top = dst - BPS;
- const uint8_t* const clip0 = clip1 + 255 - top[-1];
+ const uint8_t* const clip0 = VP8kclip1 - top[-1];
int y;
for (y = 0; y < size; ++y) {
const uint8_t* const clip = clip0 + dst[-1];
@@ -448,14 +417,9 @@ static void HE8uv(uint8_t *dst) { // horizontal
// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
int j;
-#ifndef WEBP_REFERENCE_IMPLEMENTATION
- const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
for (j = 0; j < 8; ++j) {
- *(uint64_t*)(dst + j * BPS) = v;
+ memset(dst + j * BPS, value, 8);
}
-#else
- for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
-#endif
}
static void DC8uv(uint8_t *dst) { // DC
@@ -512,61 +476,62 @@ const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
// 4 pixels in, 2 pixels out
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
- const int a1 = sclip2[112 + ((a + 4) >> 3)];
- const int a2 = sclip2[112 + ((a + 3) >> 3)];
- p[-step] = clip1[255 + p0 + a2];
- p[ 0] = clip1[255 + q0 - a1];
+ const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
+ const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ p[-step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
}
// 4 pixels in, 4 pixels out
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
- const int a1 = sclip2[112 + ((a + 4) >> 3)];
- const int a2 = sclip2[112 + ((a + 3) >> 3)];
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
const int a3 = (a1 + 1) >> 1;
- p[-2*step] = clip1[255 + p1 + a3];
- p[- step] = clip1[255 + p0 + a2];
- p[ 0] = clip1[255 + q0 - a1];
- p[ step] = clip1[255 + q1 - a3];
+ p[-2*step] = VP8kclip1[p1 + a3];
+ p[- step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a3];
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
- const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
+ const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+ // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
- p[-3*step] = clip1[255 + p2 + a3];
- p[-2*step] = clip1[255 + p1 + a2];
- p[- step] = clip1[255 + p0 + a1];
- p[ 0] = clip1[255 + q0 - a1];
- p[ step] = clip1[255 + q1 - a2];
- p[ 2*step] = clip1[255 + q2 - a3];
+ p[-3*step] = VP8kclip1[p2 + a3];
+ p[-2*step] = VP8kclip1[p1 + a2];
+ p[- step] = VP8kclip1[p0 + a1];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a2];
+ p[ 2*step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
+ return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
}
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
- const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
}
static WEBP_INLINE int needs_filter2(const uint8_t* p,
int step, int t, int it) {
- const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
- const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
- if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
- return 0;
- return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
- abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
- abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
+ const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
+ const int p0 = p[-step], q0 = p[0];
+ const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
+ return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
+ VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
+ VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
}
//------------------------------------------------------------------------------
@@ -574,8 +539,9 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
+ const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i, stride, thresh)) {
+ if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
}
}
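
The thresh2 = 2 * thresh + 1 rewrite is exactly equivalent to the old test, not an approximation: doubling both sides of 2*|p0-q0| + (|p1-q1| >> 1) <= thresh turns the shift into a plain add, and the '+ 1' absorbs the odd bit the shift used to discard (when |p1-q1| is even, the left-hand side is even too, so '<= 2t+1' and '<= 2t' agree). A brute-force confirmation over all 8-bit inputs, illustration only:

#include <stdio.h>

int main(void) {
  int a0, a1, t;
  for (t = 0; t < 256; ++t) {
    for (a0 = 0; a0 <= 255; ++a0) {
      for (a1 = 0; a1 <= 255; ++a1) {
        const int old_test = (2 * a0 + (a1 >> 1)) <= t;
        const int new_test = (4 * a0 + a1) <= (2 * t + 1);
        if (old_test != new_test) {
          printf("differ at a0=%d a1=%d t=%d\n", a0, a1, t);
          return 1;
        }
      }
    }
  }
  printf("threshold forms are equivalent for all 8-bit inputs\n");
  return 0;
}
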
@@ -583,8 +549,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
+ const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i * stride, 1, thresh)) {
+ if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
}
}
@@ -612,8 +579,9 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
@@ -627,8 +595,9 @@ static WEBP_INLINE void FilterLoop26(uint8_t* p,
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
@@ -717,10 +686,17 @@ VP8SimpleFilterFunc VP8SimpleHFilter16i;
extern void VP8DspInitSSE2(void);
extern void VP8DspInitNEON(void);
+extern void VP8DspInitMIPS32(void);
+
+static volatile VP8CPUInfo dec_last_cpuinfo_used =
+ (VP8CPUInfo)&dec_last_cpuinfo_used;
void VP8DspInit(void) {
- DspInitTables();
+ if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
+ VP8InitClipTables();
+
+ VP8TransformWHT = TransformWHT;
VP8Transform = TransformTwo;
VP8TransformUV = TransformUV;
VP8TransformDC = TransformDC;
@@ -741,7 +717,7 @@ void VP8DspInit(void) {
VP8SimpleHFilter16i = SimpleHFilter16i;
  // If non-NULL, use VP8GetCPUInfo() to overwrite some pointers with
  // faster versions.
- if (VP8GetCPUInfo) {
+ if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
@@ -750,7 +726,11 @@ void VP8DspInit(void) {
if (VP8GetCPUInfo(kNEON)) {
VP8DspInitNEON();
}
+#elif defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8DspInitMIPS32();
+ }
#endif
}
+ dec_last_cpuinfo_used = VP8GetCPUInfo;
}
-
diff --git a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
new file mode 100644
index 0000000..eec5a6d
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -0,0 +1,366 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Clipping tables for filtering
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#define USE_STATIC_TABLES // undefine to have run-time table initialization
+
+#ifdef USE_STATIC_TABLES
+
+static const uint8_t abs0[255 + 255 + 1] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
+ 0xf3, 0xf2, 0xf1, 0xf0, 0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8,
+ 0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde, 0xdd, 0xdc,
+ 0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0,
+ 0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
+ 0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac,
+ 0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0,
+ 0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97, 0x96, 0x95, 0x94,
+ 0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88,
+ 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c,
+ 0x7b, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
+ 0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64,
+ 0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58,
+ 0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c,
+ 0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
+ 0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
+ 0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28,
+ 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c,
+ 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
+ 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
+ 0x03, 0x02, 0x01, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+ 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+ 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+ 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
+ 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+ 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+ 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+ 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
+ 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
+ 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
+ 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
+ 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static const int8_t sclip1[1020 + 1020 + 1] = {
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93,
+ 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
+ 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3,
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
+ 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3,
+ 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+ 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
+ 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
+ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
+ 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
+};
+
+static const int8_t sclip2[112 + 112 + 1] = {
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
+ 0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+};
+
+static const uint8_t clip1[255 + 511 + 1] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+ 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+ 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+ 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
+ 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+ 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+ 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+ 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
+ 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
+ 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
+ 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
+ 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+#else
+
+// uninitialized tables
+static uint8_t abs0[255 + 255 + 1];
+static int8_t sclip1[1020 + 1020 + 1];
+static int8_t sclip2[112 + 112 + 1];
+static uint8_t clip1[255 + 511 + 1];
+
+// We declare this variable 'volatile' to discourage instruction reordering
+// and make sure it's set to true _last_ (a best-effort guard; this is not a
+// full memory barrier)
+static volatile int tables_ok = 0;
+
+#endif
+
+const int8_t* const VP8ksclip1 = &sclip1[1020];
+const int8_t* const VP8ksclip2 = &sclip2[112];
+const uint8_t* const VP8kclip1 = &clip1[255];
+const uint8_t* const VP8kabs0 = &abs0[255];
+
+void VP8InitClipTables(void) {
+#if !defined(USE_STATIC_TABLES)
+ int i;
+ if (!tables_ok) {
+ for (i = -255; i <= 255; ++i) {
+ abs0[255 + i] = (i < 0) ? -i : i;
+ }
+ for (i = -1020; i <= 1020; ++i) {
+ sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
+ }
+ for (i = -112; i <= 112; ++i) {
+ sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
+ }
+ for (i = -255; i <= 255 + 255; ++i) {
+ clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+ }
+ tables_ok = 1;
+ }
+#endif // USE_STATIC_TABLES
+}
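
Exporting pointers into the middle of the tables (VP8ksclip1 = &sclip1[1020], and so on) is what lets the call sites in dec.c drop the old '1020 +' / '255 +' biases and index directly with a signed difference. A small standalone illustration using a local copy of the sclip1 layout:

#include <stdio.h>

static signed char sclip1_demo[1020 + 1020 + 1];
static const signed char* const kSclip1 = &sclip1_demo[1020];

int main(void) {
  int i;
  for (i = -1020; i <= 1020; ++i) {
    sclip1_demo[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
  }
  // Signed indexing, no bias at the call site:
  printf("%d %d %d\n", kSclip1[-500], kSclip1[0], kSclip1[500]);  // -128 0 127
  return 0;
}
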
diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips32.c b/src/3rdparty/libwebp/src/dsp/dec_mips32.c
new file mode 100644
index 0000000..3e89ed3
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/dec_mips32.c
@@ -0,0 +1,578 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+static WEBP_INLINE int abs_mips32(int x) {
+ const int sign = x >> 31;
+ return (x ^ sign) - sign;
+}
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ p[-step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0);
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ const int a3 = (a1 + 1) >> 1;
+ p[-2 * step] = VP8kclip1[p1 + a3];
+ p[- step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+ const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
+ const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
+ const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
+ p[-3 * step] = VP8kclip1[p2 + a3];
+ p[-2 * step] = VP8kclip1[p1 + a2];
+ p[- step] = VP8kclip1[p0 + a1];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a2];
+ p[ 2 * step] = VP8kclip1[q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) <= thresh);
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+ int step, int t, int it) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step];
+ const int p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ if ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) > t) {
+ return 0;
+ }
+ return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
+ abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
+ abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ while (size-- > 0) {
+ if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (hev(p, hstride, hev_thresh)) {
+ do_filter2(p, hstride);
+ } else {
+ do_filter6(p, hstride);
+ }
+ }
+ p += vstride;
+ }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ while (size-- > 0) {
+ if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (hev(p, hstride, hev_thresh)) {
+ do_filter2(p, hstride);
+ } else {
+ do_filter4(p, hstride);
+ }
+ }
+ p += vstride;
+ }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ if (needs_filter(p + i, stride, thresh)) {
+ do_filter2(p + i, stride);
+ }
+ }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ if (needs_filter(p + i * stride, 1, thresh)) {
+ do_filter2(p + i * stride, 1);
+ }
+ }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14;
+ int temp15, temp16, temp17, temp18;
+ int16_t* p_in = (int16_t*)in;
+
+  // Loops are unrolled and merged to avoid the use of a temporary buffer
+  // and to reduce the number of stalls; the multiplies are issued as raw
+  // 'mul' instructions inline.
+ __asm__ volatile(
+ "lh %[temp0], 0(%[in]) \n\t"
+ "lh %[temp8], 16(%[in]) \n\t"
+ "lh %[temp4], 8(%[in]) \n\t"
+ "lh %[temp12], 24(%[in]) \n\t"
+ "addu %[temp16], %[temp0], %[temp8] \n\t"
+ "subu %[temp0], %[temp0], %[temp8] \n\t"
+ "mul %[temp8], %[temp4], %[kC2] \n\t"
+ "mul %[temp17], %[temp12], %[kC1] \n\t"
+ "mul %[temp4], %[temp4], %[kC1] \n\t"
+ "mul %[temp12], %[temp12], %[kC2] \n\t"
+ "lh %[temp1], 2(%[in]) \n\t"
+ "lh %[temp5], 10(%[in]) \n\t"
+ "lh %[temp9], 18(%[in]) \n\t"
+ "lh %[temp13], 26(%[in]) \n\t"
+ "sra %[temp8], %[temp8], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp4], %[temp4], 16 \n\t"
+ "sra %[temp12], %[temp12], 16 \n\t"
+ "lh %[temp2], 4(%[in]) \n\t"
+ "lh %[temp6], 12(%[in]) \n\t"
+ "lh %[temp10], 20(%[in]) \n\t"
+ "lh %[temp14], 28(%[in]) \n\t"
+ "subu %[temp17], %[temp8], %[temp17] \n\t"
+ "addu %[temp4], %[temp4], %[temp12] \n\t"
+ "addu %[temp8], %[temp16], %[temp4] \n\t"
+ "subu %[temp4], %[temp16], %[temp4] \n\t"
+ "addu %[temp16], %[temp1], %[temp9] \n\t"
+ "subu %[temp1], %[temp1], %[temp9] \n\t"
+ "lh %[temp3], 6(%[in]) \n\t"
+ "lh %[temp7], 14(%[in]) \n\t"
+ "lh %[temp11], 22(%[in]) \n\t"
+ "lh %[temp15], 30(%[in]) \n\t"
+ "addu %[temp12], %[temp0], %[temp17] \n\t"
+ "subu %[temp0], %[temp0], %[temp17] \n\t"
+ "mul %[temp9], %[temp5], %[kC2] \n\t"
+ "mul %[temp17], %[temp13], %[kC1] \n\t"
+ "mul %[temp5], %[temp5], %[kC1] \n\t"
+ "mul %[temp13], %[temp13], %[kC2] \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "subu %[temp17], %[temp9], %[temp17] \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "addu %[temp5], %[temp5], %[temp13] \n\t"
+ "addu %[temp13], %[temp1], %[temp17] \n\t"
+ "subu %[temp1], %[temp1], %[temp17] \n\t"
+ "mul %[temp17], %[temp14], %[kC1] \n\t"
+ "mul %[temp14], %[temp14], %[kC2] \n\t"
+ "addu %[temp9], %[temp16], %[temp5] \n\t"
+ "subu %[temp5], %[temp16], %[temp5] \n\t"
+ "addu %[temp16], %[temp2], %[temp10] \n\t"
+ "subu %[temp2], %[temp2], %[temp10] \n\t"
+ "mul %[temp10], %[temp6], %[kC2] \n\t"
+ "mul %[temp6], %[temp6], %[kC1] \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp14], %[temp14], 16 \n\t"
+ "sra %[temp10], %[temp10], 16 \n\t"
+ "sra %[temp6], %[temp6], 16 \n\t"
+ "subu %[temp17], %[temp10], %[temp17] \n\t"
+ "addu %[temp6], %[temp6], %[temp14] \n\t"
+ "addu %[temp10], %[temp16], %[temp6] \n\t"
+ "subu %[temp6], %[temp16], %[temp6] \n\t"
+ "addu %[temp14], %[temp2], %[temp17] \n\t"
+ "subu %[temp2], %[temp2], %[temp17] \n\t"
+ "mul %[temp17], %[temp15], %[kC1] \n\t"
+ "mul %[temp15], %[temp15], %[kC2] \n\t"
+ "addu %[temp16], %[temp3], %[temp11] \n\t"
+ "subu %[temp3], %[temp3], %[temp11] \n\t"
+ "mul %[temp11], %[temp7], %[kC2] \n\t"
+ "mul %[temp7], %[temp7], %[kC1] \n\t"
+ "addiu %[temp8], %[temp8], 4 \n\t"
+ "addiu %[temp12], %[temp12], 4 \n\t"
+ "addiu %[temp0], %[temp0], 4 \n\t"
+ "addiu %[temp4], %[temp4], 4 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp15], %[temp15], 16 \n\t"
+ "sra %[temp11], %[temp11], 16 \n\t"
+ "sra %[temp7], %[temp7], 16 \n\t"
+ "subu %[temp17], %[temp11], %[temp17] \n\t"
+ "addu %[temp7], %[temp7], %[temp15] \n\t"
+ "addu %[temp15], %[temp3], %[temp17] \n\t"
+ "subu %[temp3], %[temp3], %[temp17] \n\t"
+ "addu %[temp11], %[temp16], %[temp7] \n\t"
+ "subu %[temp7], %[temp16], %[temp7] \n\t"
+ "addu %[temp16], %[temp8], %[temp10] \n\t"
+ "subu %[temp8], %[temp8], %[temp10] \n\t"
+ "mul %[temp10], %[temp9], %[kC2] \n\t"
+ "mul %[temp17], %[temp11], %[kC1] \n\t"
+ "mul %[temp9], %[temp9], %[kC1] \n\t"
+ "mul %[temp11], %[temp11], %[kC2] \n\t"
+ "sra %[temp10], %[temp10], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp11], %[temp11], 16 \n\t"
+ "subu %[temp17], %[temp10], %[temp17] \n\t"
+ "addu %[temp11], %[temp9], %[temp11] \n\t"
+ "addu %[temp10], %[temp12], %[temp14] \n\t"
+ "subu %[temp12], %[temp12], %[temp14] \n\t"
+ "mul %[temp14], %[temp13], %[kC2] \n\t"
+ "mul %[temp9], %[temp15], %[kC1] \n\t"
+ "mul %[temp13], %[temp13], %[kC1] \n\t"
+ "mul %[temp15], %[temp15], %[kC2] \n\t"
+ "sra %[temp14], %[temp14], 16 \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "sra %[temp15], %[temp15], 16 \n\t"
+ "subu %[temp9], %[temp14], %[temp9] \n\t"
+ "addu %[temp15], %[temp13], %[temp15] \n\t"
+ "addu %[temp14], %[temp0], %[temp2] \n\t"
+ "subu %[temp0], %[temp0], %[temp2] \n\t"
+ "mul %[temp2], %[temp1], %[kC2] \n\t"
+ "mul %[temp13], %[temp3], %[kC1] \n\t"
+ "mul %[temp1], %[temp1], %[kC1] \n\t"
+ "mul %[temp3], %[temp3], %[kC2] \n\t"
+ "sra %[temp2], %[temp2], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "sra %[temp1], %[temp1], 16 \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "subu %[temp13], %[temp2], %[temp13] \n\t"
+ "addu %[temp3], %[temp1], %[temp3] \n\t"
+ "addu %[temp2], %[temp4], %[temp6] \n\t"
+ "subu %[temp4], %[temp4], %[temp6] \n\t"
+ "mul %[temp6], %[temp5], %[kC2] \n\t"
+ "mul %[temp1], %[temp7], %[kC1] \n\t"
+ "mul %[temp5], %[temp5], %[kC1] \n\t"
+ "mul %[temp7], %[temp7], %[kC2] \n\t"
+ "sra %[temp6], %[temp6], 16 \n\t"
+ "sra %[temp1], %[temp1], 16 \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp7], %[temp7], 16 \n\t"
+ "subu %[temp1], %[temp6], %[temp1] \n\t"
+ "addu %[temp7], %[temp5], %[temp7] \n\t"
+ "addu %[temp5], %[temp16], %[temp11] \n\t"
+ "subu %[temp16], %[temp16], %[temp11] \n\t"
+ "addu %[temp11], %[temp8], %[temp17] \n\t"
+ "subu %[temp8], %[temp8], %[temp17] \n\t"
+ "sra %[temp5], %[temp5], 3 \n\t"
+ "sra %[temp16], %[temp16], 3 \n\t"
+ "sra %[temp11], %[temp11], 3 \n\t"
+ "sra %[temp8], %[temp8], 3 \n\t"
+ "addu %[temp17], %[temp10], %[temp15] \n\t"
+ "subu %[temp10], %[temp10], %[temp15] \n\t"
+ "addu %[temp15], %[temp12], %[temp9] \n\t"
+ "subu %[temp12], %[temp12], %[temp9] \n\t"
+ "sra %[temp17], %[temp17], 3 \n\t"
+ "sra %[temp10], %[temp10], 3 \n\t"
+ "sra %[temp15], %[temp15], 3 \n\t"
+ "sra %[temp12], %[temp12], 3 \n\t"
+ "addu %[temp9], %[temp14], %[temp3] \n\t"
+ "subu %[temp14], %[temp14], %[temp3] \n\t"
+ "addu %[temp3], %[temp0], %[temp13] \n\t"
+ "subu %[temp0], %[temp0], %[temp13] \n\t"
+ "sra %[temp9], %[temp9], 3 \n\t"
+ "sra %[temp14], %[temp14], 3 \n\t"
+ "sra %[temp3], %[temp3], 3 \n\t"
+ "sra %[temp0], %[temp0], 3 \n\t"
+ "addu %[temp13], %[temp2], %[temp7] \n\t"
+ "subu %[temp2], %[temp2], %[temp7] \n\t"
+ "addu %[temp7], %[temp4], %[temp1] \n\t"
+ "subu %[temp4], %[temp4], %[temp1] \n\t"
+ "sra %[temp13], %[temp13], 3 \n\t"
+ "sra %[temp2], %[temp2], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "sra %[temp4], %[temp4], 3 \n\t"
+ "addiu %[temp6], $zero, 255 \n\t"
+ "lbu %[temp1], 0(%[dst]) \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "sra %[temp5], %[temp1], 8 \n\t"
+ "sra %[temp18], %[temp1], 31 \n\t"
+ "beqz %[temp5], 1f \n\t"
+ "xor %[temp1], %[temp1], %[temp1] \n\t"
+ "movz %[temp1], %[temp6], %[temp18] \n\t"
+ "1: \n\t"
+ "lbu %[temp18], 1(%[dst]) \n\t"
+ "sb %[temp1], 0(%[dst]) \n\t"
+ "addu %[temp18], %[temp18], %[temp11] \n\t"
+ "sra %[temp11], %[temp18], 8 \n\t"
+ "sra %[temp1], %[temp18], 31 \n\t"
+ "beqz %[temp11], 2f \n\t"
+ "xor %[temp18], %[temp18], %[temp18] \n\t"
+ "movz %[temp18], %[temp6], %[temp1] \n\t"
+ "2: \n\t"
+ "lbu %[temp1], 2(%[dst]) \n\t"
+ "sb %[temp18], 1(%[dst]) \n\t"
+ "addu %[temp1], %[temp1], %[temp8] \n\t"
+ "sra %[temp8], %[temp1], 8 \n\t"
+ "sra %[temp18], %[temp1], 31 \n\t"
+ "beqz %[temp8], 3f \n\t"
+ "xor %[temp1], %[temp1], %[temp1] \n\t"
+ "movz %[temp1], %[temp6], %[temp18] \n\t"
+ "3: \n\t"
+ "lbu %[temp18], 3(%[dst]) \n\t"
+ "sb %[temp1], 2(%[dst]) \n\t"
+ "addu %[temp18], %[temp18], %[temp16] \n\t"
+ "sra %[temp16], %[temp18], 8 \n\t"
+ "sra %[temp1], %[temp18], 31 \n\t"
+ "beqz %[temp16], 4f \n\t"
+ "xor %[temp18], %[temp18], %[temp18] \n\t"
+ "movz %[temp18], %[temp6], %[temp1] \n\t"
+ "4: \n\t"
+ "sb %[temp18], 3(%[dst]) \n\t"
+ "lbu %[temp5], 32(%[dst]) \n\t"
+ "lbu %[temp8], 33(%[dst]) \n\t"
+ "lbu %[temp11], 34(%[dst]) \n\t"
+ "lbu %[temp16], 35(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp17] \n\t"
+ "addu %[temp8], %[temp8], %[temp15] \n\t"
+ "addu %[temp11], %[temp11], %[temp12] \n\t"
+ "addu %[temp16], %[temp16], %[temp10] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "beqz %[temp18], 5f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "5: \n\t"
+ "sra %[temp18], %[temp8], 8 \n\t"
+ "sra %[temp1], %[temp8], 31 \n\t"
+ "beqz %[temp18], 6f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp1] \n\t"
+ "6: \n\t"
+ "sra %[temp18], %[temp11], 8 \n\t"
+ "sra %[temp1], %[temp11], 31 \n\t"
+ "sra %[temp17], %[temp16], 8 \n\t"
+ "sra %[temp15], %[temp16], 31 \n\t"
+ "beqz %[temp18], 7f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp1] \n\t"
+ "7: \n\t"
+ "beqz %[temp17], 8f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp15] \n\t"
+ "8: \n\t"
+ "sb %[temp5], 32(%[dst]) \n\t"
+ "sb %[temp8], 33(%[dst]) \n\t"
+ "sb %[temp11], 34(%[dst]) \n\t"
+ "sb %[temp16], 35(%[dst]) \n\t"
+ "lbu %[temp5], 64(%[dst]) \n\t"
+ "lbu %[temp8], 65(%[dst]) \n\t"
+ "lbu %[temp11], 66(%[dst]) \n\t"
+ "lbu %[temp16], 67(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp9] \n\t"
+ "addu %[temp8], %[temp8], %[temp3] \n\t"
+ "addu %[temp11], %[temp11], %[temp0] \n\t"
+ "addu %[temp16], %[temp16], %[temp14] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "sra %[temp17], %[temp8], 8 \n\t"
+ "sra %[temp15], %[temp8], 31 \n\t"
+ "sra %[temp12], %[temp11], 8 \n\t"
+ "sra %[temp10], %[temp11], 31 \n\t"
+ "sra %[temp9], %[temp16], 8 \n\t"
+ "sra %[temp3], %[temp16], 31 \n\t"
+ "beqz %[temp18], 9f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "9: \n\t"
+ "beqz %[temp17], 10f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp15] \n\t"
+ "10: \n\t"
+ "beqz %[temp12], 11f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp10] \n\t"
+ "11: \n\t"
+ "beqz %[temp9], 12f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp3] \n\t"
+ "12: \n\t"
+ "sb %[temp5], 64(%[dst]) \n\t"
+ "sb %[temp8], 65(%[dst]) \n\t"
+ "sb %[temp11], 66(%[dst]) \n\t"
+ "sb %[temp16], 67(%[dst]) \n\t"
+ "lbu %[temp5], 96(%[dst]) \n\t"
+ "lbu %[temp8], 97(%[dst]) \n\t"
+ "lbu %[temp11], 98(%[dst]) \n\t"
+ "lbu %[temp16], 99(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp13] \n\t"
+ "addu %[temp8], %[temp8], %[temp7] \n\t"
+ "addu %[temp11], %[temp11], %[temp4] \n\t"
+ "addu %[temp16], %[temp16], %[temp2] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "sra %[temp17], %[temp8], 8 \n\t"
+ "sra %[temp15], %[temp8], 31 \n\t"
+ "sra %[temp12], %[temp11], 8 \n\t"
+ "sra %[temp10], %[temp11], 31 \n\t"
+ "sra %[temp9], %[temp16], 8 \n\t"
+ "sra %[temp3], %[temp16], 31 \n\t"
+ "beqz %[temp18], 13f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "13: \n\t"
+ "beqz %[temp17], 14f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp15] \n\t"
+ "14: \n\t"
+ "beqz %[temp12], 15f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp10] \n\t"
+ "15: \n\t"
+ "beqz %[temp9], 16f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp3] \n\t"
+ "16: \n\t"
+ "sb %[temp5], 96(%[dst]) \n\t"
+ "sb %[temp8], 97(%[dst]) \n\t"
+ "sb %[temp11], 98(%[dst]) \n\t"
+ "sb %[temp16], 99(%[dst]) \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18)
+ : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
+ : "memory", "hi", "lo"
+ );
+}
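+
+// NOTE (editorial, not part of the imported sources): each lbu/addu/sra/
+// beqz/movz group above clamps (pixel + residual) to [0, 255]. When
+// (v >> 8) is non-zero the sum lies outside the low byte; the value is then
+// zeroed and movz rewrites it to 255 unless the sign bit (v >> 31) is set.
+// In C: if ((v >> 8) != 0) v = (v >> 31) ? 0 : 255;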
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
+#endif // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPS32(void);
+
+void VP8DspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+ VP8InitClipTables();
+
+ VP8Transform = TransformTwo;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+#endif // WEBP_USE_MIPS32
+}
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index 9c3d8cc..9c5bc1c 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -16,8 +16,521 @@
#if defined(WEBP_USE_NEON)
+#include "./neon.h"
#include "../dec/vp8i.h"
+//------------------------------------------------------------------------------
+// NxM Loading functions
+
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
+ "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
+ "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
+ "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
+ "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
+ "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
+ "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
+ "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
+ "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+
+#define STORE8x2(c1, c2, p, stride) \
+ "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+
+#if !defined(WORK_AROUND_GCC)
+
+// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
+// (register alloc, probably). The variants somewhat mitigate the problem, but
+// not quite. HFilter16i() remains problematic.
+static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
+ const uint8x8_t zero = vdup_n_u8(0);
+ uint8x8x4_t out;
+ INIT_VECTOR4(out, zero, zero, zero, zero);
+ out = vld4_lane_u8(src + 0 * stride, out, 0);
+ out = vld4_lane_u8(src + 1 * stride, out, 1);
+ out = vld4_lane_u8(src + 2 * stride, out, 2);
+ out = vld4_lane_u8(src + 3 * stride, out, 3);
+ out = vld4_lane_u8(src + 4 * stride, out, 4);
+ out = vld4_lane_u8(src + 5 * stride, out, 5);
+ out = vld4_lane_u8(src + 6 * stride, out, 6);
+ out = vld4_lane_u8(src + 7 * stride, out, 7);
+ return out;
+}
+
+static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1) {
+ // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
+ // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
+ const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
+ const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
+ *p1 = vcombine_u8(row0.val[0], row8.val[0]);
+ *p0 = vcombine_u8(row0.val[1], row8.val[1]);
+ *q0 = vcombine_u8(row0.val[2], row8.val[2]);
+ *q1 = vcombine_u8(row0.val[3], row8.val[3]);
+}
+
+#else // WORK_AROUND_GCC
+
+#define LOADQ_LANE_32b(VALUE, LANE) do { \
+ (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
+ src += stride; \
+} while (0)
+
+static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1) {
+ const uint32x4_t zero = vdupq_n_u32(0);
+ uint32x4x4_t in;
+ INIT_VECTOR4(in, zero, zero, zero, zero);
+ src -= 2;
+ LOADQ_LANE_32b(in.val[0], 0);
+ LOADQ_LANE_32b(in.val[1], 0);
+ LOADQ_LANE_32b(in.val[2], 0);
+ LOADQ_LANE_32b(in.val[3], 0);
+ LOADQ_LANE_32b(in.val[0], 1);
+ LOADQ_LANE_32b(in.val[1], 1);
+ LOADQ_LANE_32b(in.val[2], 1);
+ LOADQ_LANE_32b(in.val[3], 1);
+ LOADQ_LANE_32b(in.val[0], 2);
+ LOADQ_LANE_32b(in.val[1], 2);
+ LOADQ_LANE_32b(in.val[2], 2);
+ LOADQ_LANE_32b(in.val[3], 2);
+ LOADQ_LANE_32b(in.val[0], 3);
+ LOADQ_LANE_32b(in.val[1], 3);
+ LOADQ_LANE_32b(in.val[2], 3);
+ LOADQ_LANE_32b(in.val[3], 3);
+ // Transpose four 4x4 parts:
+ {
+ const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
+ vreinterpretq_u8_u32(in.val[1]));
+ const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
+ vreinterpretq_u8_u32(in.val[3]));
+ const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
+ vreinterpretq_u16_u8(row23.val[0]));
+ const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
+ vreinterpretq_u16_u8(row23.val[1]));
+ *p1 = vreinterpretq_u8_u16(row02.val[0]);
+ *p0 = vreinterpretq_u8_u16(row13.val[0]);
+ *q0 = vreinterpretq_u8_u16(row02.val[1]);
+ *q1 = vreinterpretq_u8_u16(row13.val[1]);
+ }
+}
+#undef LOADQ_LANE_32b
+
+#endif // !WORK_AROUND_GCC
+
+static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ Load4x16(src - 2, stride, p3, p2, p1, p0);
+ Load4x16(src + 2, stride, q0, q1, q2, q3);
+}
+
+static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1) {
+ *p1 = vld1q_u8(src - 2 * stride);
+ *p0 = vld1q_u8(src - 1 * stride);
+ *q0 = vld1q_u8(src + 0 * stride);
+ *q1 = vld1q_u8(src + 1 * stride);
+}
+
+static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
+ Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
+}
+
+static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
+ const uint8_t* const v,
+ int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
+ // and the v-samples in the higher half.
+ *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
+ *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
+ *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
+ *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
+ *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
+ *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
+ *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
+ *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
+}
+
+#if !defined(WORK_AROUND_GCC)
+
+#define LOAD_UV_8(ROW) \
+ vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
+
+static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
+ const uint8_t* const v,
+ int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2,
+ uint8x16_t* const p1, uint8x16_t* const p0,
+ uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
+ // and the v-samples in the higher half.
+ const uint8x16_t row0 = LOAD_UV_8(0);
+ const uint8x16_t row1 = LOAD_UV_8(1);
+ const uint8x16_t row2 = LOAD_UV_8(2);
+ const uint8x16_t row3 = LOAD_UV_8(3);
+ const uint8x16_t row4 = LOAD_UV_8(4);
+ const uint8x16_t row5 = LOAD_UV_8(5);
+ const uint8x16_t row6 = LOAD_UV_8(6);
+ const uint8x16_t row7 = LOAD_UV_8(7);
+ // Perform two side-by-side 8x8 transposes
+ // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
+ // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
+ // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
+ // u30 u31 u32 u33 u34 u35 u36 u37 | ...
+ // u40 u41 u42 u43 u44 u45 u46 u47 | ...
+ // u50 u51 u52 u53 u54 u55 u56 u57 | ...
+ // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
+ // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
+ const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
+ // u01 u11 u03 u13 ...
+ const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
+ // u21 u31 u23 u33 ...
+ const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
+ const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
+ const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
+ vreinterpretq_u16_u8(row23.val[0]));
+ const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
+ vreinterpretq_u16_u8(row23.val[1]));
+ const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
+ vreinterpretq_u16_u8(row67.val[0]));
+ const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
+ vreinterpretq_u16_u8(row67.val[1]));
+ const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
+ vreinterpretq_u32_u16(row46.val[0]));
+ const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
+ vreinterpretq_u32_u16(row46.val[1]));
+ const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
+ vreinterpretq_u32_u16(row57.val[0]));
+ const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
+ vreinterpretq_u32_u16(row57.val[1]));
+ *p3 = vreinterpretq_u8_u32(row04.val[0]);
+ *p2 = vreinterpretq_u8_u32(row15.val[0]);
+ *p1 = vreinterpretq_u8_u32(row26.val[0]);
+ *p0 = vreinterpretq_u8_u32(row37.val[0]);
+ *q0 = vreinterpretq_u8_u32(row04.val[1]);
+ *q1 = vreinterpretq_u8_u32(row15.val[1]);
+ *q2 = vreinterpretq_u8_u32(row26.val[1]);
+ *q3 = vreinterpretq_u8_u32(row37.val[1]);
+}
+#undef LOAD_UV_8
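+
+// NOTE (editorial): the vtrn cascade above is the standard three-stage NEON
+// transpose: vtrnq_u8 swaps elements at 1x1 granularity, vtrnq_u16 at 2x2
+// and vtrnq_u32 at 4x4, turning the eight loaded rows into the eight
+// columns p3..q3, independently for the u (low) and v (high) halves.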
+
+#endif // !WORK_AROUND_GCC
+
+static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
+ uint8_t* const dst, int stride) {
+ vst2_lane_u8(dst + 0 * stride, v, 0);
+ vst2_lane_u8(dst + 1 * stride, v, 1);
+ vst2_lane_u8(dst + 2 * stride, v, 2);
+ vst2_lane_u8(dst + 3 * stride, v, 3);
+ vst2_lane_u8(dst + 4 * stride, v, 4);
+ vst2_lane_u8(dst + 5 * stride, v, 5);
+ vst2_lane_u8(dst + 6 * stride, v, 6);
+ vst2_lane_u8(dst + 7 * stride, v, 7);
+}
+
+static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const dst, int stride) {
+ uint8x8x2_t lo, hi;
+ lo.val[0] = vget_low_u8(p0);
+ lo.val[1] = vget_low_u8(q0);
+ hi.val[0] = vget_high_u8(p0);
+ hi.val[1] = vget_high_u8(q0);
+ Store2x8(lo, dst - 1 + 0 * stride, stride);
+ Store2x8(hi, dst - 1 + 8 * stride, stride);
+}
+
+#if !defined(WORK_AROUND_GCC)
+static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
+ uint8_t* const dst, int stride) {
+ vst4_lane_u8(dst + 0 * stride, v, 0);
+ vst4_lane_u8(dst + 1 * stride, v, 1);
+ vst4_lane_u8(dst + 2 * stride, v, 2);
+ vst4_lane_u8(dst + 3 * stride, v, 3);
+ vst4_lane_u8(dst + 4 * stride, v, 4);
+ vst4_lane_u8(dst + 5 * stride, v, 5);
+ vst4_lane_u8(dst + 6 * stride, v, 6);
+ vst4_lane_u8(dst + 7 * stride, v, 7);
+}
+
+static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const dst, int stride) {
+ uint8x8x4_t lo, hi;
+ INIT_VECTOR4(lo,
+ vget_low_u8(p1), vget_low_u8(p0),
+ vget_low_u8(q0), vget_low_u8(q1));
+ INIT_VECTOR4(hi,
+ vget_high_u8(p1), vget_high_u8(p0),
+ vget_high_u8(q0), vget_high_u8(q1));
+ Store4x8(lo, dst - 2 + 0 * stride, stride);
+ Store4x8(hi, dst - 2 + 8 * stride, stride);
+}
+#endif // !WORK_AROUND_GCC
+
+static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const dst, int stride) {
+ vst1q_u8(dst - stride, p0);
+ vst1q_u8(dst, q0);
+}
+
+static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const dst, int stride) {
+ Store16x2(p1, p0, dst - stride, stride);
+ Store16x2(q0, q1, dst + stride, stride);
+}
+
+static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
+ // p0 and q0 contain the u+v samples packed in low/high halves.
+ vst1_u8(u - stride, vget_low_u8(p0));
+ vst1_u8(u, vget_low_u8(q0));
+ vst1_u8(v - stride, vget_high_u8(p0));
+ vst1_u8(v, vget_high_u8(q0));
+}
+
+static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
+ // The p1...q1 registers contain the u+v samples packed in low/high halves.
+ Store8x2x2(p1, p0, u - stride, v - stride, stride);
+ Store8x2x2(q0, q1, u + stride, v + stride, stride);
+}
+
+#if !defined(WORK_AROUND_GCC)
+
+#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
+ vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
+ vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
+ (DST) += stride; \
+} while (0)
+
+static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
+ const uint8x16_t p0, const uint8x16_t q0,
+ const uint8x16_t q1, const uint8x16_t q2,
+ uint8_t* u, uint8_t* v,
+ int stride) {
+ uint8x8x3_t u0, u1, v0, v1;
+ INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
+ INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
+ INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
+ INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
+ STORE6_LANE(u, u0, u1, 0);
+ STORE6_LANE(u, u0, u1, 1);
+ STORE6_LANE(u, u0, u1, 2);
+ STORE6_LANE(u, u0, u1, 3);
+ STORE6_LANE(u, u0, u1, 4);
+ STORE6_LANE(u, u0, u1, 5);
+ STORE6_LANE(u, u0, u1, 6);
+ STORE6_LANE(u, u0, u1, 7);
+ STORE6_LANE(v, v0, v1, 0);
+ STORE6_LANE(v, v0, v1, 1);
+ STORE6_LANE(v, v0, v1, 2);
+ STORE6_LANE(v, v0, v1, 3);
+ STORE6_LANE(v, v0, v1, 4);
+ STORE6_LANE(v, v0, v1, 5);
+ STORE6_LANE(v, v0, v1, 6);
+ STORE6_LANE(v, v0, v1, 7);
+}
+#undef STORE6_LANE
+
+static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
+ uint8x8x4_t u0, v0;
+ INIT_VECTOR4(u0,
+ vget_low_u8(p1), vget_low_u8(p0),
+ vget_low_u8(q0), vget_low_u8(q1));
+ INIT_VECTOR4(v0,
+ vget_high_u8(p1), vget_high_u8(p0),
+ vget_high_u8(q0), vget_high_u8(q1));
+ vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
+ vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
+ vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
+ vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
+ vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
+ vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
+ vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
+ vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
+ vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
+ vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
+ vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
+ vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
+ vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
+ vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
+ vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
+ vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
+}
+
+#endif // !WORK_AROUND_GCC
+
+// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+}
+
+// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
+// to the corresponding rows of 'dst'.
+static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
+ // Unsigned saturate to 8b.
+ const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
+ const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
+
+ // Store the results.
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
+}
+
+static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
+ uint8_t* const dst) {
+ uint32x2_t dst01 = vdup_n_u32(0);
+ uint32x2_t dst23 = vdup_n_u32(0);
+
+ // Load the source pixels.
+ dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
+ dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
+ dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
+ dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
+
+ {
+ // Convert to 16b.
+ const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
+ const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+
+ // Descale with rounding.
+ const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
+ const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
+ // Add the inverse transform.
+ SaturateAndStore4x4(dst, out01, out23);
+ }
+}
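+
+// NOTE (editorial, a scalar sketch rather than upstream code): per pixel,
+// Add4x4() and SaturateAndStore4x4() together compute
+//   dst[i] = clip_255(dst[i] + ((row[i] + 4) >> 3));
+// vrsraq_n_s16(d, r, 3) is the rounding descale-and-accumulate step and
+// vqmovun_s16 supplies the unsigned saturation to [0, 255].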
+
+//-----------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ int thresh) {
+ const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
+ const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
+ const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
+ const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
+ const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
+ const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
+ const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
+ return mask;
+}
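+
+// NOTE (editorial): in scalar terms the mask above selects pixels where
+//   2 * |p0 - q0| + |p1 - q1| / 2 <= thresh;
+// vqaddq_u8 saturates each partial sum at 255, which cannot change the
+// comparison for the thresholds VP8 actually uses (well below 255).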
+
+static int8x16_t FlipSign(const uint8x16_t v) {
+ const uint8x16_t sign_bit = vdupq_n_u8(0x80);
+ return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
+}
+
+static uint8x16_t FlipSignBack(const int8x16_t v) {
+ const int8x16_t sign_bit = vdupq_n_s8(0x80);
+ return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
+}
+
+static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1) {
+ const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
+ const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
+ const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
+ const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
+ const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
+ return s3;
+}
+
+static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
+ const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
+ const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
+ const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
+ return s2;
+}
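+
+// NOTE (editorial): both helpers accumulate (q0 - p0) stepwise instead of
+// multiplying by 3, since each intermediate sum saturates to int8_t
+// individually (cf. "beware of addition order, for saturation!" in the SSE2
+// version); a single wide multiply with one final clamp could differ.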
+
+//------------------------------------------------------------------------------
+
+static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ uint8x16_t* const op0, uint8x16_t* const oq0) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
+ const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
+ const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
+ const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
+ const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
+ const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
+ *op0 = FlipSignBack(sp0);
+ *oq0 = FlipSignBack(sq0);
+}
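+
+// NOTE (editorial, scalar sketch): ignoring the per-step int8_t saturation
+// provided by vqaddq_s8/vqsubq_s8, the update above is
+//   p0 += (delta + 3) >> 3;
+//   q0 -= (delta + 4) >> 3;
+// with the +3/+4 rounders chosen so both sides move by nearly equal amounts.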
+
+#if defined(USE_INTRINSICS)
+
+static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t mask,
+ uint8x16_t* const op0, uint8x16_t* const oq0) {
+ const int8x16_t p1s = FlipSign(p1);
+ const int8x16_t p0s = FlipSign(p0);
+ const int8x16_t q0s = FlipSign(q0);
+ const int8x16_t q1s = FlipSign(q1);
+ const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+ const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
+ ApplyFilter2(p0s, q0s, delta1, op0, oq0);
+}
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ uint8x16_t p1, p0, q0, q1, op0, oq0;
+ Load16x4(p, stride, &p1, &p0, &q0, &q1);
+ {
+ const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
+ DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+ }
+ Store16x2(op0, oq0, p, stride);
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ uint8x16_t p1, p0, q0, q1, oq0, op0;
+ Load4x16(p, stride, &p1, &p0, &q0, &q1);
+ {
+ const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
+ DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+ }
+ Store2x16(op0, oq0, p, stride);
+}
+
+#else
+
#define QRegs "q0", "q1", "q2", "q3", \
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -66,31 +579,7 @@
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
FLIP_SIGN_BIT2(p0, q0, q10)
-// Load/Store vertical edge
-#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
- "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
- "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
- "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
- "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
- "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
-
-#define STORE8x2(c1, c2, p, stride) \
- "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
- "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
-
-//-----------------------------------------------------------------------------
-// Simple In-loop filtering (Paragraph 15.2)
-
-static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
@@ -111,7 +600,7 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
);
}
-static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub r4, %[p], #2 \n" // base1 = p - 2
"lsl r6, %[stride], #1 \n" // r6 = 2 * stride
@@ -137,47 +626,416 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
);
}
-static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
- int k;
- for (k = 3; k > 0; --k) {
+#endif // USE_INTRINSICS
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ uint32_t k;
+ for (k = 3; k != 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ uint32_t k;
+ for (k = 3; k != 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ int hev_thresh) {
+ const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
+ const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
+ const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
+ const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
+ const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
+ const uint8x16_t mask = vorrq_u8(mask1, mask2);
+ return mask;
+}
+
+static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
+ const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t q2, const uint8x16_t q3,
+ int ithresh, int thresh) {
+ const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
+ const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
+ const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
+ const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
+ const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
+ const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
+ const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
+ const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
+ const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
+ const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
+ const uint8x16_t max12 = vmaxq_u8(max1, max2);
+ const uint8x16_t max123 = vmaxq_u8(max12, max3);
+ const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
+ const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
+ const uint8x16_t mask = vandq_u8(mask1, mask2);
+ return mask;
+}
+
+// 4-points filter
+
+static void ApplyFilter4(
+ const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1,
+ const int8x16_t delta0,
+ uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
+ const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
+ const int8x16_t a1 = vshrq_n_s8(delta1, 3);
+ const int8x16_t a2 = vshrq_n_s8(delta2, 3);
+ const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
+ *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2)
+ *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
+ *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3)
+ *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3)
+}
+
+static void DoFilter4(
+ const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t mask, const uint8x16_t hev_mask,
+ uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1) {
+ // This is a fused version of DoFilter2() calling ApplyFilter2 directly
+ const int8x16_t p1s = FlipSign(p1);
+ int8x16_t p0s = FlipSign(p0);
+ int8x16_t q0s = FlipSign(q0);
+ const int8x16_t q1s = FlipSign(q1);
+ const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
+
+ // do_filter2 part (simple loopfilter on pixels with hev)
+ {
+ const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
+ const int8x16_t simple_lf_delta =
+ vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
+ uint8x16_t tmp_p0, tmp_q0;
+ ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
+ // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
+ p0s = FlipSign(tmp_p0);
+ q0s = FlipSign(tmp_q0);
+ }
+
+ // do_filter4 part (complex loopfilter on pixels without hev)
+ {
+ const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
+ // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
+ const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
+ const int8x16_t complex_lf_delta =
+ vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
+ ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
+ }
+}
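+
+// NOTE (editorial): the XOR above relies on the bitwise identity
+// (m & h) ^ m == m & ~h, so the complex-filter mask is exactly the filter
+// mask with the high-edge-variance (hev) pixels removed.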
+
+// 6-points filter
+
+static void ApplyFilter6(
+ const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
+ const int8x16_t delta,
+ uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
+ const int16x8_t kCst63 = vdupq_n_s16(63);
+ const int8x8_t kCst27 = vdup_n_s8(27);
+ const int8x8_t kCst18 = vdup_n_s8(18);
+ const int8x8_t kCst9 = vdup_n_s8(9);
+ const int8x8_t delta_lo = vget_low_s8(delta);
+ const int8x8_t delta_hi = vget_high_s8(delta);
+ const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a
+ const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a
+ const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a
+ const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a
+ const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a
+ const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a
+ const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
+ const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
+ const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
+ const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
+ const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
+ const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
+ const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
+ const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
+ const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
+
+ *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1)
+ *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
+ *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2)
+ *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2)
+ *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3)
+ *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3)
+}
+
+static void DoFilter6(
+ const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+ const uint8x16_t mask, const uint8x16_t hev_mask,
+ uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
+ uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
+ // This is a fused version of DoFilter2() calling ApplyFilter2 directly
+ const int8x16_t p2s = FlipSign(p2);
+ const int8x16_t p1s = FlipSign(p1);
+ int8x16_t p0s = FlipSign(p0);
+ int8x16_t q0s = FlipSign(q0);
+ const int8x16_t q1s = FlipSign(q1);
+ const int8x16_t q2s = FlipSign(q2);
+ const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
+ const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+
+ // do_filter2 part (simple loopfilter on pixels with hev)
+ {
+ const int8x16_t simple_lf_delta =
+ vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
+ uint8x16_t tmp_p0, tmp_q0;
+ ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
+ // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
+ p0s = FlipSign(tmp_p0);
+ q0s = FlipSign(tmp_q0);
+ }
+
+ // do_filter6 part (complex loopfilter on pixels without hev)
+ {
+ // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
+ const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
+ const int8x16_t complex_lf_delta =
+ vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
+ ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
+ op2, op1, op0, oq0, oq1, oq2);
+ }
+}
+
+// on macroblock edges
+
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store16x2(op2, op1, p - 2 * stride, stride);
+ Store16x2(op0, oq0, p + 0 * stride, stride);
+ Store16x2(oq1, oq2, p + 2 * stride, stride);
+ }
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store2x16(op2, op1, p - 2, stride);
+ Store2x16(op0, oq0, p + 0, stride);
+ Store2x16(oq1, oq2, p + 2, stride);
+ }
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint32_t k;
+ uint8x16_t p3, p2, p1, p0;
+ Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+ for (k = 3; k != 0; --k) {
+ uint8x16_t q0, q1, q2, q3;
p += 4 * stride;
- SimpleVFilter16NEON(p, stride, thresh);
+ Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask =
+ NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ // p3 and p2 are not just temporary variables here: they will be
+ // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+ Store16x4(p1, p0, p3, p2, p, stride);
+ p1 = q2;
+ p0 = q3;
+ }
}
}
-static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
- int k;
- for (k = 3; k > 0; --k) {
+#if !defined(WORK_AROUND_GCC)
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint32_t k;
+ uint8x16_t p3, p2, p1, p0;
+ Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+ for (k = 3; k != 0; --k) {
+ uint8x16_t q0, q1, q2, q3;
p += 4;
- SimpleHFilter16NEON(p, stride, thresh);
+ Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask =
+ NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+ Store4x16(p1, p0, p3, p2, p, stride);
+ p1 = q2;
+ p0 = q3;
+ }
+ }
+}
+#endif // !WORK_AROUND_GCC
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
+ Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
+ Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
+ }
+}
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ u += 4 * stride;
+ v += 4 * stride;
+ Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op1, op0, oq0, oq1;
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+ Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
+ }
+}
+
+#if !defined(WORK_AROUND_GCC)
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op2, op1, op0, oq0, oq1, oq2;
+ DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
+ }
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
+ u += 4;
+ v += 4;
+ Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ {
+ const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ uint8x16_t op1, op0, oq0, oq1;
+ DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+ Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
}
}
+#endif // !WORK_AROUND_GCC
//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)
+// Technically the kC1/kC2 constants below are unsigned, but vqdmulh is only
+// available in signed.
+// vqdmulh returns high half (effectively >> 16) but also doubles the value,
+// changing the >> 16 to >> 15 and requiring an additional >> 1.
+// We use this to our advantage with kC2. The canonical value is 35468.
+// However, the high bit is set so treating it as signed will give incorrect
+// results. We avoid this by down shifting by 1 here to clear the highest bit.
+// Combined with the doubling effect of vqdmulh we get >> 16.
+// This cannot be applied to kC1 because the lowest bit is set. Down-shifting
+// the constant would reduce precision.
+
+// libwebp uses a trick to avoid an extra addition that libvpx performs.
+// Instead of:
+// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
+// same issue with kC1 and vqdmulh that we work around by down-shifting kC2.
+
+static const int16_t kC1 = 20091;
+static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
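+
+// NOTE (editorial): 17734 == 35468 >> 1, and vqdmulhq_n_s16(x, 17734)
+// evaluates (2 * x * 17734) >> 16, i.e. the canonical (x * 35468) >> 16.
+// For kC1 the code instead forms x + ((x * 20091) >> 16) by pairing
+// vqdmulhq_n_s16 with a one-bit vsraq_n_s16 accumulate; see TransformPass().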
+
+#if defined(USE_INTRINSICS)
+static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
+ int16x8x2_t* const out) {
+ // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
+ // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
+ const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
+ // b0 d0 b1 d1 b2 d2 ...
+ *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
+}
+
+static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+ // {rows} = in0 | in4
+ // in8 | in12
+ // B1 = in4 | in12
+ const int16x8_t B1 =
+ vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
+ // C0 = kC1 * in4 | kC1 * in12
+ // C1 = kC2 * in4 | kC2 * in12
+ const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
+ const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
+ const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 + in8
+ const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 - in8
+ // c = kC2 * in4 - kC1 * in12
+ // d = kC1 * in4 + kC2 * in12
+ const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
+ const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
+ const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
+ const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
+ const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
+ const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
+ const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
+ Transpose8x2(E0, E1, rows);
+}
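+
+// NOTE (editorial): each TransformPass() is one pass of the 4x4 inverse
+// transform: with a, b, c, d as labelled above it produces the four
+// outputs a + d, b + c, b - c, a - d per lane and then transposes, so
+// calling it twice processes rows then columns, like the scalar
+// TransformOne() in dsp/dec.c.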
+
static void TransformOne(const int16_t* in, uint8_t* dst) {
- const int kBPS = BPS;
- const int16_t constants[] = {20091, 17734, 0, 0};
- /* kC1, kC2. Padded because vld1.16 loads 8 bytes
- * Technically these are unsigned but vqdmulh is only available in signed.
- * vqdmulh returns high half (effectively >> 16) but also doubles the value,
- * changing the >> 16 to >> 15 and requiring an additional >> 1.
- * We use this to our advantage with kC2. The canonical value is 35468.
- * However, the high bit is set so treating it as signed will give incorrect
- * results. We avoid this by down shifting by 1 here to clear the highest bit.
- * Combined with the doubling effect of vqdmulh we get >> 16.
- * This can not be applied to kC1 because the lowest bit is set. Down shifting
- * the constant would reduce precision.
- */
-
- /* libwebp uses a trick to avoid some extra addition that libvpx does.
- * Instead of:
- * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
- * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
- * same issue with kC1 and vqdmulh that we work around by down shifting kC2
- */
+ int16x8x2_t rows;
+ INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
+ TransformPass(&rows);
+ TransformPass(&rows);
+ Add4x4(rows.val[0], rows.val[1], dst);
+}
+
+#else
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ const int kBPS = BPS;
+ // kC1, kC2. Padded because vld1.16 loads 8 bytes
+ const int16_t constants[4] = { kC1, kC2, 0, 0 };
/* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
@@ -305,6 +1163,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
+#endif // USE_INTRINSICS
+
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
@@ -313,102 +1173,90 @@ static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
}
static void TransformDC(const int16_t* in, uint8_t* dst) {
- const int DC = (in[0] + 4) >> 3;
- const int kBPS = BPS;
- __asm__ volatile (
- "vdup.16 q1, %[DC] \n"
+ const int16x8_t DC = vdupq_n_s16(in[0]);
+ Add4x4(DC, DC, dst);
+}
- "vld1.32 d0[0], [%[dst]], %[kBPS] \n"
- "vld1.32 d1[0], [%[dst]], %[kBPS] \n"
- "vld1.32 d0[1], [%[dst]], %[kBPS] \n"
- "vld1.32 d1[1], [%[dst]], %[kBPS] \n"
+//------------------------------------------------------------------------------
- "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
+#define STORE_WHT(dst, col, rows) do { \
+ *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
+ *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
+ *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
+ *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
+} while (0)
- // add DC and convert to s16.
- "vaddw.u8 q2, q1, d0 \n"
- "vaddw.u8 q3, q1, d1 \n"
- // convert back to u8 with saturation
- "vqmovun.s16 d0, q2 \n"
- "vqmovun.s16 d1, q3 \n"
+static void TransformWHT(const int16_t* in, int16_t* out) {
+ int32x4x4_t tmp;
+
+ {
+ // Load the source.
+ const int16x4_t in00_03 = vld1_s16(in + 0);
+ const int16x4_t in04_07 = vld1_s16(in + 4);
+ const int16x4_t in08_11 = vld1_s16(in + 8);
+ const int16x4_t in12_15 = vld1_s16(in + 12);
+ const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
+ const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
+ const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
+ const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
+ tmp.val[0] = vaddq_s32(a0, a1);
+ tmp.val[1] = vaddq_s32(a3, a2);
+ tmp.val[2] = vsubq_s32(a0, a1);
+ tmp.val[3] = vsubq_s32(a3, a2);
+ // Arrange the temporary results column-wise.
+ tmp = Transpose4x4(tmp);
+ }
- "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
- "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
- "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
- "vst1.32 d1[1], [%[dst]] \n"
- : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
- : [kBPS] "r"(kBPS), /* constants */
- [DC] "r"(DC)
- : "memory", "q0", "q1", "q2", "q3" /* clobbered */
- );
+ {
+ const int32x4_t kCst3 = vdupq_n_s32(3);
+ const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
+ const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
+ const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
+ const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
+ const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
+
+ tmp.val[0] = vaddq_s32(a0, a1);
+ tmp.val[1] = vaddq_s32(a3, a2);
+ tmp.val[2] = vsubq_s32(a0, a1);
+ tmp.val[3] = vsubq_s32(a3, a2);
+
+ // right shift the results by 3.
+ tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
+ tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
+ tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
+ tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
+
+ STORE_WHT(out, 0, tmp);
+ STORE_WHT(out, 1, tmp);
+ STORE_WHT(out, 2, tmp);
+ STORE_WHT(out, 3, tmp);
+ }
}
-static void TransformWHT(const int16_t* in, int16_t* out) {
- const int kStep = 32; // The store is only incrementing the pointer as if we
- // had stored a single byte.
- __asm__ volatile (
- // part 1
- // load data into q0, q1
- "vld1.16 {q0, q1}, [%[in]] \n"
-
- "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
- "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
- "vsubl.s16 q10, d1, d2 \n" // a2 = in[4] - in[8]
- "vsubl.s16 q11, d0, d3 \n" // a3 = in[0] - in[12]
-
- "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
- "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
- "vadd.s32 q1, q11, q10 \n" // tmp[4] = a3 + a2
- "vsub.s32 q3, q11, q10 \n" // tmp[12] = a3 - a2
-
- // Transpose
- // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
- // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
- "vswp d1, d4 \n" // vtrn.64 q0, q2
- "vswp d3, d6 \n" // vtrn.64 q1, q3
- "vtrn.32 q0, q1 \n"
- "vtrn.32 q2, q3 \n"
-
- "vmov.s32 q10, #3 \n" // dc = 3
- "vadd.s32 q0, q0, q10 \n" // dc = tmp[0] + 3
- "vadd.s32 q12, q0, q3 \n" // a0 = dc + tmp[3]
- "vadd.s32 q13, q1, q2 \n" // a1 = tmp[1] + tmp[2]
- "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
- "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
-
- "vadd.s32 q0, q12, q13 \n"
- "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
- "vadd.s32 q1, q9, q8 \n"
- "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
- "vsub.s32 q2, q12, q13 \n"
- "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
- "vsub.s32 q3, q9, q8 \n"
- "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
-
- // set the results to output
- "vst1.16 d0[0], [%[out]], %[kStep] \n"
- "vst1.16 d1[0], [%[out]], %[kStep] \n"
- "vst1.16 d2[0], [%[out]], %[kStep] \n"
- "vst1.16 d3[0], [%[out]], %[kStep] \n"
- "vst1.16 d0[1], [%[out]], %[kStep] \n"
- "vst1.16 d1[1], [%[out]], %[kStep] \n"
- "vst1.16 d2[1], [%[out]], %[kStep] \n"
- "vst1.16 d3[1], [%[out]], %[kStep] \n"
- "vst1.16 d0[2], [%[out]], %[kStep] \n"
- "vst1.16 d1[2], [%[out]], %[kStep] \n"
- "vst1.16 d2[2], [%[out]], %[kStep] \n"
- "vst1.16 d3[2], [%[out]], %[kStep] \n"
- "vst1.16 d0[3], [%[out]], %[kStep] \n"
- "vst1.16 d1[3], [%[out]], %[kStep] \n"
- "vst1.16 d2[3], [%[out]], %[kStep] \n"
- "vst1.16 d3[3], [%[out]], %[kStep] \n"
-
- : [out] "+r"(out) // modified registers
- : [in] "r"(in), [kStep] "r"(kStep) // constants
- : "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13" // clobbered
- );
+#undef STORE_WHT
+
+//------------------------------------------------------------------------------
+
+#define MUL(a, b) (((a) * (b)) >> 16)
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ static const int kC1_full = 20091 + (1 << 16);
+ static const int kC2_full = 35468;
+ const int16x4_t A = vdup_n_s16(in[0]);
+ const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
+ const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
+ const int c1 = MUL(in[1], kC2_full);
+ const int d1 = MUL(in[1], kC1_full);
+ const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
+ (uint64_t)( c1 & 0xffff) << 16 |
+ (uint64_t)(-c1 & 0xffff) << 32 |
+ (uint64_t)(-d1 & 0xffff) << 48;
+ const int16x4_t CD = vcreate_s16(cd);
+ const int16x4_t B = vqadd_s16(A, CD);
+ const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
+ const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
+ Add4x4(m0_m1, m2_m3, dst);
}
+#undef MUL
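+
+// NOTE (editorial): TransformAC3() handles blocks whose only non-zero
+// coefficients are in[0], in[1] and in[4]. CD packs the horizontal
+// contribution {d1, c1, -c1, -d1}; the four output rows then add the
+// vertical terms +d4, +c4, -c4 and -d4 respectively, before Add4x4()
+// performs the final rounding descale and store.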
#endif // WEBP_USE_NEON
@@ -420,14 +1268,25 @@ extern void VP8DspInitNEON(void);
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
VP8Transform = TransformTwo;
- VP8TransformAC3 = TransformOne; // no special code here
+ VP8TransformAC3 = TransformAC3;
VP8TransformDC = TransformDC;
VP8TransformWHT = TransformWHT;
- VP8SimpleVFilter16 = SimpleVFilter16NEON;
- VP8SimpleHFilter16 = SimpleHFilter16NEON;
- VP8SimpleVFilter16i = SimpleVFilter16iNEON;
- VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+ VP8VFilter16 = VFilter16;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16 = HFilter16;
+#if !defined(WORK_AROUND_GCC)
+ VP8HFilter16i = HFilter16i;
+#endif
+ VP8VFilter8 = VFilter8;
+ VP8VFilter8i = VFilter8i;
+#if !defined(WORK_AROUND_GCC)
+ VP8HFilter8 = HFilter8;
+ VP8HFilter8i = HFilter8i;
+#endif
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
#endif // WEBP_USE_NEON
}
-
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
index 150c559..c37a637 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -26,7 +26,7 @@
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
-static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -246,7 +246,7 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
#if defined(USE_TRANSFORM_AC3)
#define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
const __m128i A = _mm_set1_epi16(in[0] + 4);
@@ -298,20 +298,15 @@ static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
_mm_subs_epu8((q), (p)), \
_mm_subs_epu8((p), (q)))
-// Shift each byte of "a" by N bits while preserving by the sign bit.
-//
-// It first shifts the lower bytes of the words and then the upper bytes and
-// then merges the results together.
-#define SIGNED_SHIFT_N(a, N) { \
- __m128i t = a; \
- t = _mm_slli_epi16(t, 8); \
- t = _mm_srai_epi16(t, N); \
- t = _mm_srli_epi16(t, 8); \
- \
- a = _mm_srai_epi16(a, N + 8); \
- a = _mm_slli_epi16(a, 8); \
- \
- a = _mm_or_si128(t, a); \
+// Shift each byte of "x" by 3 bits while preserving by the sign bit.
+static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i signs = _mm_cmpgt_epi8(zero, *x);
+ const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs); // s8 -> s16 sign extend
+ const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
+ const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
+ const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
+ *x = _mm_packs_epi16(lo_1, hi_1);
}
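+
+// NOTE (editorial): SSE2 lacks a per-byte arithmetic shift, so
+// SignedShift8b() sign-extends each byte to 16 bits (unpacking against a
+// computed sign mask), shifts the 16-bit lanes and re-packs with signed
+// saturation; the pack cannot saturate, since a sign-extended byte shifted
+// right by 3 fits in [-16, 15].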
#define FLIP_SIGN_BIT2(a, b) { \
@@ -324,103 +319,123 @@ static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
FLIP_SIGN_BIT2(c, d); \
}
-#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) { \
- const __m128i zero = _mm_setzero_si128(); \
- const __m128i t_1 = MM_ABS(p1, p0); \
- const __m128i t_2 = MM_ABS(q1, q0); \
- \
- const __m128i h = _mm_set1_epi8(hev_thresh); \
- const __m128i t_3 = _mm_subs_epu8(t_1, h); /* abs(p1 - p0) - hev_tresh */ \
- const __m128i t_4 = _mm_subs_epu8(t_2, h); /* abs(q1 - q0) - hev_tresh */ \
- \
- not_hev = _mm_or_si128(t_3, t_4); \
- not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\
-}
-
-#define GET_BASE_DELTA(p1, p0, q0, q1, o) { \
- const __m128i qp0 = _mm_subs_epi8(q0, p0); /* q0 - p0 */ \
- o = _mm_subs_epi8(p1, q1); /* p1 - q1 */ \
- o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 1 * (q0 - p0) */ \
- o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 2 * (q0 - p0) */ \
- o = _mm_adds_epi8(o, qp0); /* p1 - q1 + 3 * (q0 - p0) */ \
-}
-
-#define DO_SIMPLE_FILTER(p0, q0, fl) { \
- const __m128i three = _mm_set1_epi8(3); \
- const __m128i four = _mm_set1_epi8(4); \
- __m128i v3 = _mm_adds_epi8(fl, three); \
- __m128i v4 = _mm_adds_epi8(fl, four); \
- \
- /* Do +4 side */ \
- SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \
- q0 = _mm_subs_epi8(q0, v4); /* q0 -= v4 */ \
- \
- /* Now do +3 side */ \
- SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \
- p0 = _mm_adds_epi8(p0, v3); /* p0 += v3 */ \
+// input/output is uint8_t
+static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int hev_thresh, __m128i* const not_hev) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t_1 = MM_ABS(*p1, *p0);
+ const __m128i t_2 = MM_ABS(*q1, *q0);
+
+ const __m128i h = _mm_set1_epi8(hev_thresh);
+ const __m128i t_3 = _mm_subs_epu8(t_1, h); // abs(p1 - p0) - hev_thresh
+ const __m128i t_4 = _mm_subs_epu8(t_2, h); // abs(q1 - q0) - hev_thresh
+
+ *not_hev = _mm_or_si128(t_3, t_4);
+ *not_hev = _mm_cmpeq_epi8(*not_hev, zero); // not_hev <= t1 && not_hev <= t2
+}
+
+// input pixels are int8_t
+static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ __m128i* const delta) {
+ // beware of addition order, for saturation!
+ const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
+ const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0
+ const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0); // p1 - q1 + 1 * (q0 - p0)
+ const __m128i s2 = _mm_adds_epi8(q0_p0, s1); // p1 - q1 + 2 * (q0 - p0)
+ const __m128i s3 = _mm_adds_epi8(q0_p0, s2); // p1 - q1 + 3 * (q0 - p0)
+ *delta = s3;
+}
+
+// input and output are int8_t
+static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
+ const __m128i* const fl) {
+ const __m128i k3 = _mm_set1_epi8(3);
+ const __m128i k4 = _mm_set1_epi8(4);
+ __m128i v3 = _mm_adds_epi8(*fl, k3);
+ __m128i v4 = _mm_adds_epi8(*fl, k4);
+
+ SignedShift8b(&v4); // v4 >> 3
+ SignedShift8b(&v3); // v3 >> 3
+ *q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4
+ *p0 = _mm_adds_epi8(*p0, v3); // p0 += v3
}
// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
-#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \
- const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \
- const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \
- const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7); \
- pi = _mm_adds_epi8(pi, delta); \
- qi = _mm_subs_epi8(qi, delta); \
+// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
+static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
+ const __m128i* const a0_lo,
+ const __m128i* const a0_hi) {
+ const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
+ const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
+ const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
+ const __m128i sign_bit = _mm_set1_epi8(0x80);
+ *pi = _mm_adds_epi8(*pi, delta);
+ *qi = _mm_subs_epi8(*qi, delta);
+ FLIP_SIGN_BIT2(*pi, *qi);
}
-static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
- const __m128i* q1, int thresh, __m128i *mask) {
- __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
- *mask = _mm_set1_epi8(0xFE);
- t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero
- t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2
+// input pixels are uint8_t
+static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int thresh, __m128i* const mask) {
+ const __m128i m_thresh = _mm_set1_epi8(thresh);
+ const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
+ const __m128i kFE = _mm_set1_epi8(0xFE);
+ const __m128i t2 = _mm_and_si128(t1, kFE); // set lsb of each byte to zero
+ const __m128i t3 = _mm_srli_epi16(t2, 1); // abs(p1 - q1) / 2
- *mask = MM_ABS(*p0, *q0); // abs(p0 - q0)
- *mask = _mm_adds_epu8(*mask, *mask); // abs(p0 - q0) * 2
- *mask = _mm_adds_epu8(*mask, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ const __m128i t4 = MM_ABS(*p0, *q0); // abs(p0 - q0)
+ const __m128i t5 = _mm_adds_epu8(t4, t4); // abs(p0 - q0) * 2
+ const __m128i t6 = _mm_adds_epu8(t5, t3); // abs(p0-q0)*2 + abs(p1-q1)/2
- t1 = _mm_set1_epi8(thresh);
- *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh
- *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
+ const __m128i t7 = _mm_subs_epu8(t6, m_thresh); // mask <= m_thresh
+ *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
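
NeedsFilter() tests the standard VP8 filter condition per pixel; clearing
the lsb before the 16-bit shift is what makes the byte-wise halving safe.
Scalar sketch (helper name illustrative):

  #include <stdint.h>
  #include <stdlib.h>

  static int needs_filter(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                          int thresh) {
    return 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh;
  }
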
//------------------------------------------------------------------------------
// Edge filtering functions
// Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
- const __m128i* q1, int thresh) {
+static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1,
+ int thresh) {
__m128i a, mask;
const __m128i sign_bit = _mm_set1_epi8(0x80);
+ // convert p1/q1 to int8_t (for GetBaseDelta)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
NeedsFilter(p1, p0, q0, q1, thresh, &mask);
- // convert to signed values
FLIP_SIGN_BIT2(*p0, *q0);
-
- GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
+ GetBaseDelta(&p1s, p0, q0, &q1s, &a);
a = _mm_and_si128(a, mask); // mask filter values we don't care about
- DO_SIMPLE_FILTER(*p0, *q0, a);
-
- // unoffset
+ DoSimpleFilter(p0, q0, &a);
FLIP_SIGN_BIT2(*p0, *q0);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
- __m128i* q0, __m128i* q1,
- const __m128i* mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1,
+ const __m128i* const mask, int hev_thresh) {
+ const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i k64 = _mm_set1_epi8(0x40);
+ const __m128i zero = _mm_setzero_si128();
__m128i not_hev;
__m128i t1, t2, t3;
- const __m128i sign_bit = _mm_set1_epi8(0x80);
// compute hev mask
- GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+ GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
// convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@@ -433,92 +448,83 @@ static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about
- // Do +4 side
- t2 = _mm_set1_epi8(4);
- t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 4
- SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
- t3 = t2; // save t2
- *q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2
-
- // Now do +3 side
t2 = _mm_set1_epi8(3);
- t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4
- SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+ t3 = _mm_set1_epi8(4);
+ t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 3
+ t3 = _mm_adds_epi8(t1, t3); // 3 * (q0 - p0) + (p1 - q1) + 4
+ SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+ SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
+ *q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3
+ FLIP_SIGN_BIT2(*p0, *q0);
- t2 = _mm_set1_epi8(1);
- t3 = _mm_adds_epi8(t3, t2);
- SIGNED_SHIFT_N(t3, 1); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4
+  // this is equivalent to the signed (a + 1) >> 1 calculation
+ t2 = _mm_add_epi8(t3, sign_bit);
+ t3 = _mm_avg_epu8(t2, zero);
+ t3 = _mm_sub_epi8(t3, k64);
t3 = _mm_and_si128(not_hev, t3); // if !hev
*q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3
*p1 = _mm_adds_epi8(*p1, t3); // p1 += t3
-
- // unoffset
- FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+ FLIP_SIGN_BIT2(*p1, *q1);
}
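
The _mm_avg_epu8 sequence above computes a signed (a + 1) >> 1 without a
signed 8-bit shift: bias the value into unsigned range, average against
zero, then remove the bias. Scalar check of the identity (helper name
illustrative):

  #include <stdint.h>

  static int8_t avg_shift1(int8_t a) {
    const uint8_t biased = (uint8_t)(a + 0x80);  // flip the sign bit
    // _mm_avg_epu8(x, 0) == (x + 0 + 1) >> 1; subtracting 64 undoes the
    // bias after halving: ((a + 128 + 1) >> 1) - 64 == (a + 1) >> 1
    return (int8_t)(((biased + 1) >> 1) - 64);
  }
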
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
- __m128i* q0, __m128i* q1, __m128i *q2,
- const __m128i* mask, int hev_thresh) {
- __m128i a, not_hev;
+static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
+ __m128i* const p0, __m128i* const q0,
+ __m128i* const q1, __m128i* const q2,
+ const __m128i* const mask, int hev_thresh) {
+ const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
+ __m128i a, not_hev;
// compute hev mask
- GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+ GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
- // convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
FLIP_SIGN_BIT2(*p2, *q2);
-
- GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
+ GetBaseDelta(p1, p0, q0, q1, &a);
{ // do simple filter on pixels with hev
const __m128i m = _mm_andnot_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
- DO_SIMPLE_FILTER(*p0, *q0, f);
+ DoSimpleFilter(p0, q0, &f);
}
+
{ // do strong filter on pixels with not hev
- const __m128i zero = _mm_setzero_si128();
- const __m128i nine = _mm_set1_epi16(0x0900);
- const __m128i sixty_three = _mm_set1_epi16(63);
+ const __m128i k9 = _mm_set1_epi16(0x0900);
+ const __m128i k63 = _mm_set1_epi16(63);
const __m128i m = _mm_and_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
+
const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
- const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine); // Filter (lo) * 9
- const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine); // Filter (hi) * 9
- const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo); // Filter (lo) * 18
- const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi); // Filter (hi) * 18
+ const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9); // Filter (lo) * 9
+ const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9); // Filter (hi) * 9
- const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three); // Filter * 9 + 63
- const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three); // Filter * 9 + 63
+ const __m128i a2_lo = _mm_add_epi16(f9_lo, k63); // Filter * 9 + 63
+ const __m128i a2_hi = _mm_add_epi16(f9_hi, k63); // Filter * 9 + 63
- const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three); // F... * 18 + 63
- const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three); // F... * 18 + 63
+ const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo); // Filter * 18 + 63
+ const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi); // Filter * 18 + 63
- const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo); // Filter * 27 + 63
- const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi); // Filter * 27 + 63
+ const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63
+ const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63
- UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
- UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
- UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
+ Update2Pixels(p2, q2, &a2_lo, &a2_hi);
+ Update2Pixels(p1, q1, &a1_lo, &a1_hi);
+ Update2Pixels(p0, q0, &a0_lo, &a0_hi);
}
-
- // unoffset
- FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
- FLIP_SIGN_BIT2(*p2, *q2);
}
// reads 8 rows across a vertical edge.
//
// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
// two Load4x4() to avoid code duplication.
-static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
- __m128i* p, __m128i* q) {
+static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
+ __m128i* const p, __m128i* const q) {
__m128i t1, t2;
// Load 0th, 1st, 4th and 5th rows
@@ -557,10 +563,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
*q = _mm_unpackhi_epi32(t1, t2);
}
-static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
+static WEBP_INLINE void Load16x4(const uint8_t* const r0,
+ const uint8_t* const r8,
int stride,
- __m128i* p1, __m128i* p0,
- __m128i* q0, __m128i* q1) {
+ __m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1) {
__m128i t1, t2;
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
@@ -592,7 +599,7 @@ static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
*q1 = _mm_unpackhi_epi64(t2, *q1);
}
-static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
*((int32_t*)dst) = _mm_cvtsi128_si32(*x);
@@ -601,48 +608,51 @@ static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
}
// Transpose back and store
-static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
- __m128i* p1, __m128i* p0,
- __m128i* q0, __m128i* q1) {
- __m128i t1;
+static WEBP_INLINE void Store16x4(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ uint8_t* r0, uint8_t* r8,
+ int stride) {
+ __m128i t1, p1_s, p0_s, q0_s, q1_s;
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
// p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
t1 = *p0;
- *p0 = _mm_unpacklo_epi8(*p1, t1);
- *p1 = _mm_unpackhi_epi8(*p1, t1);
+ p0_s = _mm_unpacklo_epi8(*p1, t1);
+ p1_s = _mm_unpackhi_epi8(*p1, t1);
// q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
// q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
t1 = *q0;
- *q0 = _mm_unpacklo_epi8(t1, *q1);
- *q1 = _mm_unpackhi_epi8(t1, *q1);
+ q0_s = _mm_unpacklo_epi8(t1, *q1);
+ q1_s = _mm_unpackhi_epi8(t1, *q1);
// p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
// q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- t1 = *p0;
- *p0 = _mm_unpacklo_epi16(t1, *q0);
- *q0 = _mm_unpackhi_epi16(t1, *q0);
+ t1 = p0_s;
+ p0_s = _mm_unpacklo_epi16(t1, q0_s);
+ q0_s = _mm_unpackhi_epi16(t1, q0_s);
// p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
// q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- t1 = *p1;
- *p1 = _mm_unpacklo_epi16(t1, *q1);
- *q1 = _mm_unpackhi_epi16(t1, *q1);
+ t1 = p1_s;
+ p1_s = _mm_unpacklo_epi16(t1, q1_s);
+ q1_s = _mm_unpackhi_epi16(t1, q1_s);
- Store4x4(p0, r0, stride);
+ Store4x4(&p0_s, r0, stride);
r0 += 4 * stride;
- Store4x4(q0, r0, stride);
+ Store4x4(&q0_s, r0, stride);
- Store4x4(p1, r8, stride);
+ Store4x4(&p1_s, r8, stride);
r8 += 4 * stride;
- Store4x4(q1, r8, stride);
+ Store4x4(&q1_s, r8, stride);
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
-static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
// Load
__m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
__m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
@@ -653,49 +663,49 @@ static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
// Store
_mm_storeu_si128((__m128i*)&p[-stride], p0);
- _mm_storeu_si128((__m128i*)p, q0);
+ _mm_storeu_si128((__m128i*)&p[0], q0);
}
-static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
__m128i p1, p0, q0, q1;
p -= 2; // beginning of p1
- Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
- Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
}
-static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
- SimpleVFilter16SSE2(p, stride, thresh);
+ SimpleVFilter16(p, stride, thresh);
}
}
-static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
- SimpleHFilter16SSE2(p, stride, thresh);
+ SimpleHFilter16(p, stride, thresh);
}
}
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
-#define MAX_DIFF1(p3, p2, p1, p0, m) { \
- m = MM_ABS(p3, p2); \
+#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
+ m = MM_ABS(p1, p0); \
+ m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
- m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
-}
+} while (0)
-#define MAX_DIFF2(p3, p2, p1, p0, m) { \
+#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
+ m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
- m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
-}
+} while (0)
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
@@ -704,10 +714,11 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
}
-#define LOADUV_H_EDGE(p, u, v, stride) { \
- p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
- p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)])); \
-}
+#define LOADUV_H_EDGE(p, u, v, stride) do { \
+ const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
+ const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
+ p = _mm_unpacklo_epi64(U, V); \
+} while (0)
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
LOADUV_H_EDGE(e1, u, v, 0 * stride); \
@@ -722,18 +733,23 @@ static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
_mm_storel_epi64((__m128i*)&v[(stride)], p); \
}
-#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) { \
- __m128i fl_yes; \
- const __m128i it = _mm_set1_epi8(ithresh); \
- mask = _mm_subs_epu8(mask, it); \
- mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); \
- NeedsFilter(&p1, &p0, &q0, &q1, thresh, &fl_yes); \
- mask = _mm_and_si128(mask, fl_yes); \
+static WEBP_INLINE void ComplexMask(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int thresh, int ithresh,
+ __m128i* const mask) {
+ const __m128i it = _mm_set1_epi8(ithresh);
+ const __m128i diff = _mm_subs_epu8(*mask, it);
+ const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
+ __m128i filter_mask;
+ NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+ *mask = _mm_and_si128(thresh_mask, filter_mask);
}
// on macroblock edges
-static void VFilter16SSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i t1;
__m128i mask;
__m128i p2, p1, p0, q0, q1, q2;
@@ -746,20 +762,20 @@ static void VFilter16SSE2(uint8_t* p, int stride,
LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
_mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
_mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
- _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
- _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
- _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
+ _mm_storeu_si128((__m128i*)&p[+0 * stride], q0);
+ _mm_storeu_si128((__m128i*)&p[+1 * stride], q1);
+ _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
}
-static void HFilter16SSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
@@ -770,71 +786,78 @@ static void HFilter16SSE2(uint8_t* p, int stride,
Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
MAX_DIFF2(q3, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
- Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
- Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
+ Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+ Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
}
// on three inner edges
-static void VFilter16iSSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
- __m128i mask;
- __m128i t1, t2, p1, p0, q0, q1;
+ __m128i p3, p2, p1, p0; // loop invariants
- for (k = 3; k > 0; --k) {
- // Load p3, p2, p1, p0
- LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
- MAX_DIFF1(t2, t1, p1, p0, mask);
+ LOAD_H_EDGES4(p, stride, p3, p2, p1, p0); // prologue
+ for (k = 3; k > 0; --k) {
+ __m128i mask, tmp1, tmp2;
+ uint8_t* const b = p + 2 * stride; // beginning of p1
p += 4 * stride;
- // Load q0, q1, q2, q3
- LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
- MAX_DIFF2(t2, t1, q1, q0, mask);
+ MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
+ LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
+ MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
- DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+    // p3 and p2 are not just temporary variables here: they will be
+    // re-used for the next span, and q2/q3 will become p1/p0 accordingly.
+ ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+ DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
// Store
- _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
- _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
- _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
- _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+ _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
+ _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
+ _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
+ _mm_storeu_si128((__m128i*)&b[3 * stride], p2);
+
+ // rotate samples
+ p1 = tmp1;
+ p0 = tmp2;
}
}
-static void HFilter16iSSE2(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
- uint8_t* b;
- __m128i mask;
- __m128i t1, t2, p1, p0, q0, q1;
+ __m128i p3, p2, p1, p0; // loop invariants
+
+ Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
for (k = 3; k > 0; --k) {
- b = p;
- Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
- MAX_DIFF1(t2, t1, p1, p0, mask);
+ __m128i mask, tmp1, tmp2;
+ uint8_t* const b = p + 2; // beginning of p1
- b += 4; // beginning of q0
- Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
- MAX_DIFF2(t2, t1, q1, q0, mask);
+ p += 4; // beginning of q0 (and next span)
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
- DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+ MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
+ Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+ MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
- b -= 2; // beginning of p1
- Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+ DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
- p += 4;
+ Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+
+ // rotate samples
+ p1 = tmp1;
+ p0 = tmp2;
}
}
// 8-pixels wide variant, for chroma filtering
-static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@@ -846,7 +869,7 @@ static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
@@ -858,8 +881,8 @@ static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
-static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
@@ -871,15 +894,15 @@ static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
MAX_DIFF2(q3, q2, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
- Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
- Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
+ Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
+ Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
}
-static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@@ -894,7 +917,7 @@ static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
MAX_DIFF2(t2, t1, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store
@@ -904,8 +927,8 @@ static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
-static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
@@ -916,12 +939,12 @@ static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
- COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+ ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
- Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
+ Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
}
#endif // WEBP_USE_SSE2
@@ -933,24 +956,23 @@ extern void VP8DspInitSSE2(void);
void VP8DspInitSSE2(void) {
#if defined(WEBP_USE_SSE2)
- VP8Transform = TransformSSE2;
+ VP8Transform = Transform;
#if defined(USE_TRANSFORM_AC3)
- VP8TransformAC3 = TransformAC3SSE2;
+ VP8TransformAC3 = TransformAC3;
#endif
- VP8VFilter16 = VFilter16SSE2;
- VP8HFilter16 = HFilter16SSE2;
- VP8VFilter8 = VFilter8SSE2;
- VP8HFilter8 = HFilter8SSE2;
- VP8VFilter16i = VFilter16iSSE2;
- VP8HFilter16i = HFilter16iSSE2;
- VP8VFilter8i = VFilter8iSSE2;
- VP8HFilter8i = HFilter8iSSE2;
-
- VP8SimpleVFilter16 = SimpleVFilter16SSE2;
- VP8SimpleHFilter16 = SimpleHFilter16SSE2;
- VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
- VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
#endif // WEBP_USE_SSE2
}
-
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 3be783a..2409bae 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -14,6 +14,10 @@
#ifndef WEBP_DSP_DSP_H_
#define WEBP_DSP_DSP_H_
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
#include "../webp/types.h"
#ifdef __cplusplus
@@ -23,27 +27,66 @@ extern "C" {
//------------------------------------------------------------------------------
// CPU detection
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+ (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+ (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif // __clang__
+
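+// These version guards are meant for compile-time checks such as the
+// following (illustrative):
+//   #if LOCAL_GCC_PREREQ(4, 7) || LOCAL_CLANG_PREREQ(3, 3)
+//   // code relying on a compiler feature introduced in these versions
+//   #endif
+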
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
#endif
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2)
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set, so they should succeed on one of the earlier tests.
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
#define WEBP_USE_SSE2
#endif
+#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
+#define WEBP_USE_AVX2
+#endif
+
#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
#define WEBP_ANDROID_NEON // Android targets that might support NEON
#endif
-#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+ defined(__aarch64__)) && !defined(__native_client__)
#define WEBP_USE_NEON
#endif
+#if defined(__mips__) && !defined(__mips64) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#endif
+#endif
+
typedef enum {
kSSE2,
kSSE3,
- kNEON
+ kAVX,
+ kAVX2,
+ kNEON,
+ kMIPS32
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
@@ -61,7 +104,6 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
-extern VP8WHT VP8ITransformWHT;
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
@@ -83,7 +125,7 @@ extern VP8BlockCopy VP8Copy4x4;
// Quantization
struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
- int n, const struct VP8Matrix* const mtx);
+ const struct VP8Matrix* const mtx);
extern VP8QuantizeBlock VP8EncQuantizeBlock;
// specific to 2nd transform:
@@ -121,6 +163,13 @@ extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+// clipping tables (for filtering)
+extern const int8_t* const VP8ksclip1; // clips [-1020, 1020] to [-128, 127]
+extern const int8_t* const VP8ksclip2; // clips [-112, 112] to [-16, 15]
+extern const uint8_t* const VP8kclip1; // clips [-255,511] to [0,255]
+extern const uint8_t* const VP8kabs0; // abs(x) for x in [-255,255]
+void VP8InitClipTables(void); // must be called first
+
// simple filter (only for luma)
typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
extern VP8SimpleFilterFunc VP8SimpleVFilter16;
@@ -166,21 +215,20 @@ typedef void (*WebPUpsampleLinePairFunc)(
// Fancy upsampling functions to convert YUV to RGB(A) modes
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-// Initializes SSE2 version of the fancy upsamplers.
-void WebPInitUpsamplersSSE2(void);
-
-// NEON version
-void WebPInitUpsamplersNEON(void);
-
#endif // FANCY_UPSAMPLING
-// Point-sampling methods.
-typedef void (*WebPSampleLinePairFunc)(
- const uint8_t* top_y, const uint8_t* bottom_y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* top_dst, uint8_t* bottom_dst, int len);
+// Per-row point-sampling methods.
+typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len);
+// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+ const uint8_t* u, const uint8_t* v, int uv_stride,
+ uint8_t* dst, int dst_stride,
+ int width, int height, WebPSamplerRowFunc func);
-extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
+// Sampling functions to convert rows of YUV to RGB(A)
+extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
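
A row sampler is typically applied to a whole picture through the helper
above; a hypothetical call site (the y/u/v/dst buffers and strides are
assumed to be set up by the caller; MODE_RGB comes from webp/decode.h):

  WebPSamplerProcessPlane(y, y_stride, u, v, uv_stride,
                          dst, dst_stride, width, height,
                          WebPSamplers[MODE_RGB]);
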
// General function for converting two lines of ARGB or RGBA.
// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
@@ -194,11 +242,14 @@ typedef void (*WebPYUV444Converter)(const uint8_t* y,
extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
-// Main function to be called
+// Must be called before using the WebPUpsamplers[] (and for premultiplied
+// colorspaces like rgbA, rgbA4444, etc.)
void WebPInitUpsamplers(void);
+// Must be called before using WebPSamplers[]
+void WebPInitSamplers(void);
//------------------------------------------------------------------------------
-// Pre-multiply planes with alpha values
+// Utilities for processing transparent channel.
// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
@@ -209,13 +260,34 @@ extern void (*WebPApplyAlphaMultiply)(
extern void (*WebPApplyAlphaMultiply4444)(
uint8_t* rgba4444, int w, int h, int stride);
-// To be called first before using the above.
-void WebPInitPremultiply(void);
+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there are only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride);
-void WebPInitPremultiplySSE2(void); // should not be called directly.
-void WebPInitPremultiplyNEON(void);
+// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
+// Un-Multiply operation transforms x into x * 255 / A.
-//------------------------------------------------------------------------------
+// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
+extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+
+// Same as WebPMultARGBRow(), but for several rows.
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+ int inverse);
+
+// Same for a row of single values, with side alpha values.
+extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse);
+
+// Same as WebPMultRow(), but for several 'num_rows' rows.
+void WebPMultRows(uint8_t* ptr, int stride,
+ const uint8_t* alpha, int alpha_stride,
+ int width, int num_rows, int inverse);
+
+// To be called first before using the above.
+void WebPInitAlphaProcessing(void);
#ifdef __cplusplus
} // extern "C"
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index fcc6ec8..f4e72d4 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -159,33 +159,6 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
}
}
-static void ITransformWHT(const int16_t* in, int16_t* out) {
- int tmp[16];
- int i;
- for (i = 0; i < 4; ++i) {
- const int a0 = in[0 + i] + in[12 + i];
- const int a1 = in[4 + i] + in[ 8 + i];
- const int a2 = in[4 + i] - in[ 8 + i];
- const int a3 = in[0 + i] - in[12 + i];
- tmp[0 + i] = a0 + a1;
- tmp[8 + i] = a0 - a1;
- tmp[4 + i] = a3 + a2;
- tmp[12 + i] = a3 - a2;
- }
- for (i = 0; i < 4; ++i) {
- const int dc = tmp[0 + i * 4] + 3; // w/ rounder
- const int a0 = dc + tmp[3 + i * 4];
- const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
- const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
- const int a3 = dc - tmp[3 + i * 4];
- out[ 0] = (a0 + a1) >> 3;
- out[16] = (a3 + a2) >> 3;
- out[32] = (a0 - a1) >> 3;
- out[48] = (a3 - a2) >> 3;
- out += 64;
- }
-}
-
static void FTransformWHT(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
@@ -627,21 +600,23 @@ static const uint8_t kZigzag[16] = {
// Simple quantization
static int QuantizeBlock(int16_t in[16], int16_t out[16],
- int n, const VP8Matrix* const mtx) {
+ const VP8Matrix* const mtx) {
int last = -1;
- for (; n < 16; ++n) {
+ int n;
+ for (n = 0; n < 16; ++n) {
const int j = kZigzag[n];
const int sign = (in[j] < 0);
- const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
if (coeff > mtx->zthresh_[j]) {
- const int Q = mtx->q_[j];
- const int iQ = mtx->iq_[j];
- const int B = mtx->bias_[j];
- out[n] = QUANTDIV(coeff, iQ, B);
- if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
- if (sign) out[n] = -out[n];
- in[j] = out[n] * Q;
- if (out[n]) last = n;
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = mtx->bias_[j];
+ int level = QUANTDIV(coeff, iQ, B);
+ if (level > MAX_LEVEL) level = MAX_LEVEL;
+ if (sign) level = -level;
+ in[j] = level * Q;
+ out[n] = level;
+ if (level) last = n;
} else {
out[n] = 0;
in[j] = 0;
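
QUANTDIV (from enc/vp8enci.h) is a fixed-point division; with the uint32_t
operands introduced above it is, in effect, the following (QFIX is 17,
matching the 'sra ..., 17' in the MIPS version further below):

  static int quantdiv(uint32_t coeff, uint32_t iQ, uint32_t B) {
    return (int)((coeff * iQ + B) >> 17);  // QFIX == 17
  }
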
@@ -656,17 +631,18 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
for (n = 0; n < 16; ++n) {
const int j = kZigzag[n];
const int sign = (in[j] < 0);
- const int coeff = sign ? -in[j] : in[j];
+ const uint32_t coeff = sign ? -in[j] : in[j];
assert(mtx->sharpen_[j] == 0);
if (coeff > mtx->zthresh_[j]) {
- const int Q = mtx->q_[j];
- const int iQ = mtx->iq_[j];
- const int B = mtx->bias_[j];
- out[n] = QUANTDIV(coeff, iQ, B);
- if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
- if (sign) out[n] = -out[n];
- in[j] = out[n] * Q;
- if (out[n]) last = n;
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = mtx->bias_[j];
+ int level = QUANTDIV(coeff, iQ, B);
+ if (level > MAX_LEVEL) level = MAX_LEVEL;
+ if (sign) level = -level;
+ in[j] = level * Q;
+ out[n] = level;
+ if (level) last = n;
} else {
out[n] = 0;
in[j] = 0;
@@ -697,7 +673,6 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
-VP8WHT VP8ITransformWHT;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
@@ -713,16 +688,23 @@ VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitAVX2(void);
extern void VP8EncDspInitNEON(void);
+extern void VP8EncDspInitMIPS32(void);
+
+static volatile VP8CPUInfo enc_last_cpuinfo_used =
+ (VP8CPUInfo)&enc_last_cpuinfo_used;
void VP8EncDspInit(void) {
+ if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ VP8DspInit(); // common inverse transforms
InitTables();
// default C implementations
VP8CollectHistogram = CollectHistogram;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
- VP8ITransformWHT = ITransformWHT;
VP8FTransformWHT = FTransformWHT;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
@@ -738,16 +720,28 @@ void VP8EncDspInit(void) {
VP8Copy4x4 = Copy4x4;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
- if (VP8GetCPUInfo) {
+ if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
}
-#elif defined(WEBP_USE_NEON)
+#endif
+#if defined(WEBP_USE_AVX2)
+ if (VP8GetCPUInfo(kAVX2)) {
+ VP8EncDspInitAVX2();
+ }
+#endif
+#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8EncDspInitNEON();
}
#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspInitMIPS32();
+ }
+#endif
}
+ enc_last_cpuinfo_used = VP8GetCPUInfo;
}
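
The enc_last_cpuinfo_used guard makes VP8EncDspInit() idempotent: the
CPU-detection hook itself serves as the "already initialized" marker, with
a self-referential cast as the never-run sentinel. The pattern in isolation
(names illustrative):

  static volatile VP8CPUInfo last_cpuinfo = (VP8CPUInfo)&last_cpuinfo;

  void InitOnce(void) {
    if (last_cpuinfo == VP8GetCPUInfo) return;  // already set up
    // ... assign default and CPU-specific function pointers ...
    last_cpuinfo = VP8GetCPUInfo;
  }
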
diff --git a/src/3rdparty/libwebp/src/dec/layer.c b/src/3rdparty/libwebp/src/dsp/enc_avx2.c
index dacb9e2..372e616 100644
--- a/src/3rdparty/libwebp/src/dec/layer.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_avx2.c
@@ -1,4 +1,4 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
+// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
@@ -7,24 +7,18 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// Enhancement layer (for YUV444/422)
-//
-// Author: Skal (pascal.massimino@gmail.com)
+// AVX2 version of speed-critical encoding functions.
-#include <assert.h>
-#include <stdlib.h>
+#include "./dsp.h"
-#include "./vp8i.h"
+#if defined(WEBP_USE_AVX2)
-//------------------------------------------------------------------------------
+#endif // WEBP_USE_AVX2
-int VP8DecodeLayer(VP8Decoder* const dec) {
- assert(dec);
- assert(dec->layer_data_size_ > 0);
- (void)dec;
+//------------------------------------------------------------------------------
+// Entry point
- // TODO: handle enhancement layer here.
+extern void VP8EncDspInitAVX2(void);
- return 1;
+void VP8EncDspInitAVX2(void) {
}
-
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
new file mode 100644
index 0000000..def9a16
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -0,0 +1,776 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+// Slobodan Prijic (slobodan.prijic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/vp8enci.h"
+#include "../enc/cost.h"
+
+#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
+#define WORK_AROUND_GCC
+#endif
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// macro for one vertical pass in ITransformOne
+// MUL macro inlined
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from in buffer
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+// TEMP4..TEMP5 - temporary registers
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lh %[temp16], "#A"(%[temp20]) \n\t" \
+ "lh %[temp18], "#B"(%[temp20]) \n\t" \
+ "lh %[temp17], "#C"(%[temp20]) \n\t" \
+ "lh %[temp19], "#D"(%[temp20]) \n\t" \
+ "addu %["#TEMP4"], %[temp16], %[temp18] \n\t" \
+ "subu %[temp16], %[temp16], %[temp18] \n\t" \
+ "mul %["#TEMP0"], %[temp17], %[kC2] \n\t" \
+ "mul %[temp18], %[temp19], %[kC1] \n\t" \
+ "mul %[temp17], %[temp17], %[kC1] \n\t" \
+ "mul %[temp19], %[temp19], %[kC2] \n\t" \
+  "sra    %["#TEMP0"], %["#TEMP0"], 16         \n\t"                        \
+  "sra    %[temp18],   %[temp18],   16         \n\t"                        \
+  "sra    %[temp17],   %[temp17],   16         \n\t"                        \
+  "sra    %[temp19],   %[temp19],   16         \n\t"                        \
+ "subu %["#TEMP2"], %["#TEMP0"], %[temp18] \n\t" \
+ "addu %["#TEMP3"], %[temp17], %[temp19] \n\t" \
+ "addu %["#TEMP0"], %["#TEMP4"], %["#TEMP3"] \n\t" \
+ "addu %["#TEMP1"], %[temp16], %["#TEMP2"] \n\t" \
+ "subu %["#TEMP2"], %[temp16], %["#TEMP2"] \n\t" \
+ "subu %["#TEMP3"], %["#TEMP4"], %["#TEMP3"] \n\t"
+
+// macro for one horizontal pass in ITransformOne
+// MUL and STORE macros inlined
+// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from ref and store to dst buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addiu %["#TEMP0"], %["#TEMP0"], 4 \n\t" \
+ "addu %[temp16], %["#TEMP0"], %["#TEMP8"] \n\t" \
+ "subu %[temp17], %["#TEMP0"], %["#TEMP8"] \n\t" \
+ "mul %["#TEMP0"], %["#TEMP4"], %[kC2] \n\t" \
+ "mul %["#TEMP8"], %["#TEMP12"], %[kC1] \n\t" \
+ "mul %["#TEMP4"], %["#TEMP4"], %[kC1] \n\t" \
+ "mul %["#TEMP12"], %["#TEMP12"], %[kC2] \n\t" \
+ "sra %["#TEMP0"], %["#TEMP0"], 16 \n\t" \
+ "sra %["#TEMP8"], %["#TEMP8"], 16 \n\t" \
+ "sra %["#TEMP4"], %["#TEMP4"], 16 \n\t" \
+ "sra %["#TEMP12"], %["#TEMP12"], 16 \n\t" \
+ "subu %[temp18], %["#TEMP0"], %["#TEMP8"] \n\t" \
+ "addu %[temp19], %["#TEMP4"], %["#TEMP12"] \n\t" \
+ "addu %["#TEMP0"], %[temp16], %[temp19] \n\t" \
+ "addu %["#TEMP4"], %[temp17], %[temp18] \n\t" \
+ "subu %["#TEMP8"], %[temp17], %[temp18] \n\t" \
+ "subu %["#TEMP12"], %[temp16], %[temp19] \n\t" \
+ "lw %[temp20], 0(%[args]) \n\t" \
+ "sra %["#TEMP0"], %["#TEMP0"], 3 \n\t" \
+ "sra %["#TEMP4"], %["#TEMP4"], 3 \n\t" \
+ "sra %["#TEMP8"], %["#TEMP8"], 3 \n\t" \
+ "sra %["#TEMP12"], %["#TEMP12"], 3 \n\t" \
+ "lbu %[temp16], "#A"(%[temp20]) \n\t" \
+ "lbu %[temp17], "#B"(%[temp20]) \n\t" \
+ "lbu %[temp18], "#C"(%[temp20]) \n\t" \
+ "lbu %[temp19], "#D"(%[temp20]) \n\t" \
+ "addu %["#TEMP0"], %[temp16], %["#TEMP0"] \n\t" \
+ "addu %["#TEMP4"], %[temp17], %["#TEMP4"] \n\t" \
+ "addu %["#TEMP8"], %[temp18], %["#TEMP8"] \n\t" \
+ "addu %["#TEMP12"], %[temp19], %["#TEMP12"] \n\t" \
+ "slt %[temp16], %["#TEMP0"], $zero \n\t" \
+ "slt %[temp17], %["#TEMP4"], $zero \n\t" \
+ "slt %[temp18], %["#TEMP8"], $zero \n\t" \
+ "slt %[temp19], %["#TEMP12"], $zero \n\t" \
+ "movn %["#TEMP0"], $zero, %[temp16] \n\t" \
+ "movn %["#TEMP4"], $zero, %[temp17] \n\t" \
+ "movn %["#TEMP8"], $zero, %[temp18] \n\t" \
+ "movn %["#TEMP12"], $zero, %[temp19] \n\t" \
+ "addiu %[temp20], $zero, 255 \n\t" \
+ "slt %[temp16], %["#TEMP0"], %[temp20] \n\t" \
+ "slt %[temp17], %["#TEMP4"], %[temp20] \n\t" \
+ "slt %[temp18], %["#TEMP8"], %[temp20] \n\t" \
+ "slt %[temp19], %["#TEMP12"], %[temp20] \n\t" \
+ "movz %["#TEMP0"], %[temp20], %[temp16] \n\t" \
+ "movz %["#TEMP4"], %[temp20], %[temp17] \n\t" \
+ "lw %[temp16], 8(%[args]) \n\t" \
+ "movz %["#TEMP8"], %[temp20], %[temp18] \n\t" \
+ "movz %["#TEMP12"], %[temp20], %[temp19] \n\t" \
+ "sb %["#TEMP0"], "#A"(%[temp16]) \n\t" \
+ "sb %["#TEMP4"], "#B"(%[temp16]) \n\t" \
+ "sb %["#TEMP8"], "#C"(%[temp16]) \n\t" \
+ "sb %["#TEMP12"], "#D"(%[temp16]) \n\t"
+
+// Does one inverse transform.
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+ int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
+ const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
+
+ __asm__ volatile(
+ "lw %[temp20], 4(%[args]) \n\t"
+ VERTICAL_PASS(0, 16, 8, 24, temp4, temp0, temp1, temp2, temp3)
+ VERTICAL_PASS(2, 18, 10, 26, temp8, temp4, temp5, temp6, temp7)
+ VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
+ VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
+
+ HORIZONTAL_PASS( 0, 1, 2, 3, temp0, temp4, temp8, temp12)
+ HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9, temp13)
+ HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
+ HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// macro for one pass through the for loop in QuantizeBlock
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+#define QUANTIZE_ONE(J, K, N) \
+ "lh %[temp0], "#J"(%[ppin]) \n\t" \
+ "lhu %[temp1], "#J"(%[ppsharpen]) \n\t" \
+ "lw %[temp2], "#K"(%[ppzthresh]) \n\t" \
+ "sra %[sign], %[temp0], 15 \n\t" \
+ "xor %[coeff], %[temp0], %[sign] \n\t" \
+ "subu %[coeff], %[coeff], %[sign] \n\t" \
+ "addu %[coeff], %[coeff], %[temp1] \n\t" \
+ "slt %[temp4], %[temp2], %[coeff] \n\t" \
+ "addiu %[temp5], $zero, 0 \n\t" \
+ "addiu %[level], $zero, 0 \n\t" \
+ "beqz %[temp4], 2f \n\t" \
+ "lhu %[temp1], "#J"(%[ppiq]) \n\t" \
+ "lw %[temp2], "#K"(%[ppbias]) \n\t" \
+ "lhu %[temp3], "#J"(%[ppq]) \n\t" \
+ "mul %[level], %[coeff], %[temp1] \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "xor %[level], %[level], %[sign] \n\t" \
+ "subu %[level], %[level], %[sign] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+"2: \n\t" \
+ "sh %[temp5], "#J"(%[ppin]) \n\t" \
+ "sh %[level], "#N"(%[pout]) \n\t"
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int sign, coeff, level, i;
+ int max_level = MAX_LEVEL;
+
+ int16_t* ppin = &in[0];
+ int16_t* pout = &out[0];
+ const uint16_t* ppsharpen = &mtx->sharpen_[0];
+ const uint32_t* ppzthresh = &mtx->zthresh_[0];
+ const uint16_t* ppq = &mtx->q_[0];
+ const uint16_t* ppiq = &mtx->iq_[0];
+ const uint32_t* ppbias = &mtx->bias_[0];
+
+ __asm__ volatile(
+ QUANTIZE_ONE( 0, 0, 0)
+ QUANTIZE_ONE( 2, 4, 2)
+ QUANTIZE_ONE( 8, 16, 4)
+ QUANTIZE_ONE(16, 32, 6)
+ QUANTIZE_ONE(10, 20, 8)
+ QUANTIZE_ONE( 4, 8, 10)
+ QUANTIZE_ONE( 6, 12, 12)
+ QUANTIZE_ONE(12, 24, 14)
+ QUANTIZE_ONE(18, 36, 16)
+ QUANTIZE_ONE(24, 48, 18)
+ QUANTIZE_ONE(26, 52, 20)
+ QUANTIZE_ONE(20, 40, 22)
+ QUANTIZE_ONE(14, 28, 24)
+ QUANTIZE_ONE(22, 44, 26)
+ QUANTIZE_ONE(28, 56, 28)
+ QUANTIZE_ONE(30, 60, 30)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+ [level]"=&r"(level)
+ : [pout]"r"(pout), [ppin]"r"(ppin),
+ [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+ [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+ [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+ : "memory", "hi", "lo"
+ );
+
+  // moved out of the macro to allow breaking out of the loop earlier
+ for (i = 15; i >= 0; i--) {
+ if (out[i]) return 1;
+ }
+ return 0;
+}
+
+#undef QUANTIZE_ONE
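
The offset triples passed to QUANTIZE_ONE above encode the zig-zag scan at
compile time: J = kZigzag[n] * 2 (int16_t elements), K = kZigzag[n] * 4
(uint32_t elements) and N = n * 2, with the standard VP8 order

  kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
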
+
+// macro for one horizontal pass in Disto4x4 (TTransform)
+// the two calls to TTransform are merged into a single pass
+// A..D - offsets in bytes to load from a and b buffers
+// E..H - offsets in bytes to store first results to tmp buffer
+// E1..H1 - offsets in bytes to store second results to tmp buffer
+#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1) \
+ "lbu %[temp0], "#A"(%[a]) \n\t" \
+ "lbu %[temp1], "#B"(%[a]) \n\t" \
+ "lbu %[temp2], "#C"(%[a]) \n\t" \
+ "lbu %[temp3], "#D"(%[a]) \n\t" \
+ "lbu %[temp4], "#A"(%[b]) \n\t" \
+ "lbu %[temp5], "#B"(%[b]) \n\t" \
+ "lbu %[temp6], "#C"(%[b]) \n\t" \
+ "lbu %[temp7], "#D"(%[b]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "addu %[temp2], %[temp1], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp3] \n\t" \
+ "addu %[temp3], %[temp4], %[temp6] \n\t" \
+ "subu %[temp4], %[temp4], %[temp6] \n\t" \
+ "addu %[temp6], %[temp5], %[temp7] \n\t" \
+ "subu %[temp5], %[temp5], %[temp7] \n\t" \
+ "addu %[temp7], %[temp8], %[temp2] \n\t" \
+ "subu %[temp2], %[temp8], %[temp2] \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp3], %[temp6] \n\t" \
+ "subu %[temp3], %[temp3], %[temp6] \n\t" \
+ "addu %[temp6], %[temp4], %[temp5] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "sw %[temp7], "#E"(%[tmp]) \n\t" \
+ "sw %[temp2], "#H"(%[tmp]) \n\t" \
+ "sw %[temp8], "#F"(%[tmp]) \n\t" \
+ "sw %[temp0], "#G"(%[tmp]) \n\t" \
+ "sw %[temp1], "#E1"(%[tmp]) \n\t" \
+ "sw %[temp3], "#H1"(%[tmp]) \n\t" \
+ "sw %[temp6], "#F1"(%[tmp]) \n\t" \
+ "sw %[temp4], "#G1"(%[tmp]) \n\t"
+
+// macro for one vertical pass in Disto4x4 (TTransform)
+// the two calls to TTransform are merged into a single pass;
+// since only one accumulator (hi/lo) is available in the mips32r1
+// instruction set, the second TTransform call is evaluated first,
+// followed by the first one.
+// const int sum1 = TTransform(a, w);
+// const int sum2 = TTransform(b, w);
+// return abs(sum2 - sum1) >> 5;
+// (sum2 - sum1) is calculated with madd (accumulating sum2) and
+// msub (subtracting sum1)
+// A..D - offsets in bytes to load first results from tmp buffer
+// A1..D1 - offsets in bytes to load second results from tmp buffer
+// E..H - offsets in bytes to load from w buffer
+#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
+ "lw %[temp0], "#A1"(%[tmp]) \n\t" \
+ "lw %[temp1], "#C1"(%[tmp]) \n\t" \
+ "lw %[temp2], "#B1"(%[tmp]) \n\t" \
+ "lw %[temp3], "#D1"(%[tmp]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp2], %[temp3] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp3], %[temp8], %[temp1] \n\t" \
+ "subu %[temp8], %[temp8], %[temp1] \n\t" \
+ "addu %[temp1], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "sra %[temp4], %[temp3], 31 \n\t" \
+ "sra %[temp5], %[temp1], 31 \n\t" \
+ "sra %[temp6], %[temp0], 31 \n\t" \
+ "sra %[temp7], %[temp8], 31 \n\t" \
+ "xor %[temp3], %[temp3], %[temp4] \n\t" \
+ "xor %[temp1], %[temp1], %[temp5] \n\t" \
+ "xor %[temp0], %[temp0], %[temp6] \n\t" \
+ "xor %[temp8], %[temp8], %[temp7] \n\t" \
+ "subu %[temp3], %[temp3], %[temp4] \n\t" \
+ "subu %[temp1], %[temp1], %[temp5] \n\t" \
+ "subu %[temp0], %[temp0], %[temp6] \n\t" \
+ "subu %[temp8], %[temp8], %[temp7] \n\t" \
+ "lhu %[temp4], "#E"(%[w]) \n\t" \
+ "lhu %[temp5], "#F"(%[w]) \n\t" \
+ "lhu %[temp6], "#G"(%[w]) \n\t" \
+ "lhu %[temp7], "#H"(%[w]) \n\t" \
+ "madd %[temp4], %[temp3] \n\t" \
+ "madd %[temp5], %[temp1] \n\t" \
+ "madd %[temp6], %[temp0] \n\t" \
+ "madd %[temp7], %[temp8] \n\t" \
+ "lw %[temp0], "#A"(%[tmp]) \n\t" \
+ "lw %[temp1], "#C"(%[tmp]) \n\t" \
+ "lw %[temp2], "#B"(%[tmp]) \n\t" \
+ "lw %[temp3], "#D"(%[tmp]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp2], %[temp3] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp3], %[temp8], %[temp1] \n\t" \
+ "subu %[temp1], %[temp8], %[temp1] \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "sra %[temp2], %[temp3], 31 \n\t" \
+ "xor %[temp3], %[temp3], %[temp2] \n\t" \
+ "subu %[temp3], %[temp3], %[temp2] \n\t" \
+ "msub %[temp4], %[temp3] \n\t" \
+ "sra %[temp2], %[temp8], 31 \n\t" \
+ "sra %[temp3], %[temp0], 31 \n\t" \
+ "sra %[temp4], %[temp1], 31 \n\t" \
+ "xor %[temp8], %[temp8], %[temp2] \n\t" \
+ "xor %[temp0], %[temp0], %[temp3] \n\t" \
+ "xor %[temp1], %[temp1], %[temp4] \n\t" \
+ "subu %[temp8], %[temp8], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp4] \n\t" \
+ "msub %[temp5], %[temp8] \n\t" \
+ "msub %[temp6], %[temp0] \n\t" \
+ "msub %[temp7], %[temp1] \n\t"
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int tmp[32];
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+ __asm__ volatile(
+ HORIZONTAL_PASS( 0, 1, 2, 3, 0, 4, 8, 12, 64, 68, 72, 76)
+ HORIZONTAL_PASS(16, 17, 18, 19, 16, 20, 24, 28, 80, 84, 88, 92)
+ HORIZONTAL_PASS(32, 33, 34, 35, 32, 36, 40, 44, 96, 100, 104, 108)
+ HORIZONTAL_PASS(48, 49, 50, 51, 48, 52, 56, 60, 112, 116, 120, 124)
+ "mthi $zero \n\t"
+ "mtlo $zero \n\t"
+ VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
+ VERTICAL_PASS( 4, 20, 36, 52, 68, 84, 100, 116, 2, 10, 18, 26)
+ VERTICAL_PASS( 8, 24, 40, 56, 72, 88, 104, 120, 4, 12, 20, 28)
+ VERTICAL_PASS(12, 28, 44, 60, 76, 92, 108, 124, 6, 14, 22, 30)
+ "mflo %[temp0] \n\t"
+ "sra %[temp1], %[temp0], 31 \n\t"
+ "xor %[temp0], %[temp0], %[temp1] \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "sra %[temp0], %[temp0], 5 \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
+ : "memory", "hi", "lo"
+ );
+
+ return temp0;
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %["#TEMP1"], 0(%[args]) \n\t" \
+ "lw %["#TEMP2"], 4(%[args]) \n\t" \
+ "lbu %[temp16], "#A"(%["#TEMP1"]) \n\t" \
+ "lbu %[temp17], "#A"(%["#TEMP2"]) \n\t" \
+ "lbu %[temp18], "#B"(%["#TEMP1"]) \n\t" \
+ "lbu %[temp19], "#B"(%["#TEMP2"]) \n\t" \
+ "subu %[temp20], %[temp16], %[temp17] \n\t" \
+ "lbu %[temp16], "#C"(%["#TEMP1"]) \n\t" \
+ "lbu %[temp17], "#C"(%["#TEMP2"]) \n\t" \
+ "subu %["#TEMP0"], %[temp18], %[temp19] \n\t" \
+ "lbu %[temp18], "#D"(%["#TEMP1"]) \n\t" \
+ "lbu %[temp19], "#D"(%["#TEMP2"]) \n\t" \
+ "subu %["#TEMP1"], %[temp16], %[temp17] \n\t" \
+ "subu %["#TEMP2"], %[temp18], %[temp19] \n\t" \
+ "addu %["#TEMP3"], %[temp20], %["#TEMP2"] \n\t" \
+ "subu %["#TEMP2"], %[temp20], %["#TEMP2"] \n\t" \
+ "addu %[temp20], %["#TEMP0"], %["#TEMP1"] \n\t" \
+ "subu %["#TEMP0"], %["#TEMP0"], %["#TEMP1"] \n\t" \
+ "mul %[temp16], %["#TEMP2"], %[c5352] \n\t" \
+ "mul %[temp17], %["#TEMP2"], %[c2217] \n\t" \
+ "mul %[temp18], %["#TEMP0"], %[c5352] \n\t" \
+ "mul %[temp19], %["#TEMP0"], %[c2217] \n\t" \
+ "addu %["#TEMP1"], %["#TEMP3"], %[temp20] \n\t" \
+ "subu %[temp20], %["#TEMP3"], %[temp20] \n\t" \
+ "sll %["#TEMP0"], %["#TEMP1"], 3 \n\t" \
+ "sll %["#TEMP2"], %[temp20], 3 \n\t" \
+ "addiu %[temp16], %[temp16], 1812 \n\t" \
+ "addiu %[temp17], %[temp17], 937 \n\t" \
+ "addu %[temp16], %[temp16], %[temp19] \n\t" \
+ "subu %[temp17], %[temp17], %[temp18] \n\t" \
+ "sra %["#TEMP1"], %[temp16], 9 \n\t" \
+ "sra %["#TEMP3"], %[temp17], 9 \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addu %[temp16], %["#TEMP0"], %["#TEMP12"] \n\t" \
+ "subu %[temp19], %["#TEMP0"], %["#TEMP12"] \n\t" \
+ "addu %[temp17], %["#TEMP4"], %["#TEMP8"] \n\t" \
+ "subu %[temp18], %["#TEMP4"], %["#TEMP8"] \n\t" \
+ "mul %["#TEMP8"], %[temp19], %[c2217] \n\t" \
+ "mul %["#TEMP12"], %[temp18], %[c2217] \n\t" \
+ "mul %["#TEMP4"], %[temp19], %[c5352] \n\t" \
+ "mul %[temp18], %[temp18], %[c5352] \n\t" \
+ "addiu %[temp16], %[temp16], 7 \n\t" \
+ "addu %["#TEMP0"], %[temp16], %[temp17] \n\t" \
+ "sra %["#TEMP0"], %["#TEMP0"], 4 \n\t" \
+ "addu %["#TEMP12"], %["#TEMP12"], %["#TEMP4"] \n\t" \
+ "subu %["#TEMP4"], %[temp16], %[temp17] \n\t" \
+ "sra %["#TEMP4"], %["#TEMP4"], 4 \n\t" \
+ "addiu %["#TEMP8"], %["#TEMP8"], 30000 \n\t" \
+ "addiu %["#TEMP12"], %["#TEMP12"], 12000 \n\t" \
+ "addiu %["#TEMP8"], %["#TEMP8"], 21000 \n\t" \
+ "subu %["#TEMP8"], %["#TEMP8"], %[temp18] \n\t" \
+ "sra %["#TEMP12"], %["#TEMP12"], 16 \n\t" \
+ "sra %["#TEMP8"], %["#TEMP8"], 16 \n\t" \
+ "addiu %[temp16], %["#TEMP12"], 1 \n\t" \
+ "movn %["#TEMP12"], %[temp16], %[temp19] \n\t" \
+ "sh %["#TEMP0"], "#A"(%[temp20]) \n\t" \
+ "sh %["#TEMP4"], "#C"(%[temp20]) \n\t" \
+ "sh %["#TEMP8"], "#D"(%[temp20]) \n\t" \
+ "sh %["#TEMP12"], "#B"(%[temp20]) \n\t"
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ int temp17, temp18, temp19, temp20;
+ const int c2217 = 2217;
+ const int c5352 = 5352;
+ const int* const args[3] =
+ { (const int*)src, (const int*)ref, (const int*)out };
+
+ __asm__ volatile(
+ HORIZONTAL_PASS( 0, 1, 2, 3, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(16, 17, 18, 19, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(32, 33, 34, 35, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
+ "lw %[temp20], 8(%[args]) \n\t"
+ VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
+ VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
+ VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+ VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+ : "memory", "hi", "lo"
+ );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// Forward declaration.
+extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
+
+int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ const uint16_t* t = res->cost[n][ctx0];
+ int cost;
+ const int const_2 = 2;
+ const int const_255 = 255;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ int res_cost;
+ int res_prob;
+ int res_coeffs;
+ int res_last;
+ int v_reg;
+ int b_reg;
+ int ctx_reg;
+ int cost_add, temp_1, temp_2, temp_3;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ res_cost = (int)res->cost;
+ res_prob = (int)res->prob;
+ res_coeffs = (int)res->coeffs;
+ res_last = (int)res->last;
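+ // The pointer-to-int casts are safe here: this file is only built for
+ // 32-bit MIPS (WEBP_USE_MIPS32), where pointers and 'int' are both 32 bits.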
+
+ __asm__ volatile(
+ ".set push \n\t"
+ ".set noreorder \n\t"
+
+ "sll %[temp_1], %[n], 1 \n\t"
+ "addu %[res_coeffs], %[res_coeffs], %[temp_1] \n\t"
+ "slt %[temp_2], %[n], %[res_last] \n\t"
+ "bnez %[temp_2], 1f \n\t"
+ " li %[cost_add], 0 \n\t"
+ "b 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lh %[v_reg], 0(%[res_coeffs]) \n\t"
+ "addu %[b_reg], %[n], %[VP8EncBands] \n\t"
+ "move %[temp_1], %[const_max_level] \n\t"
+ "addu %[cost], %[cost], %[cost_add] \n\t"
+ "negu %[temp_2], %[v_reg] \n\t"
+ "slti %[temp_3], %[v_reg], 0 \n\t"
+ "movn %[v_reg], %[temp_2], %[temp_3] \n\t"
+ "lbu %[b_reg], 1(%[b_reg]) \n\t"
+ "li %[cost_add], 0 \n\t"
+
+ "sltiu %[temp_3], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp_3] \n\t"
+ // cost += VP8LevelCost(t, v);
+ "slt %[temp_3], %[v_reg], %[const_max_level] \n\t"
+ "movn %[temp_1], %[v_reg], %[temp_3] \n\t"
+ "sll %[temp_2], %[v_reg], 1 \n\t"
+ "addu %[temp_2], %[temp_2], %[VP8LevelFixedCosts] \n\t"
+ "lhu %[temp_2], 0(%[temp_2]) \n\t"
+ "sll %[temp_1], %[temp_1], 1 \n\t"
+ "addu %[temp_1], %[temp_1], %[t] \n\t"
+ "lhu %[temp_3], 0(%[temp_1]) \n\t"
+ "addu %[cost], %[cost], %[temp_2] \n\t"
+
+ // t = res->cost[b][ctx];
+ "sll %[temp_1], %[ctx_reg], 7 \n\t"
+ "sll %[temp_2], %[ctx_reg], 3 \n\t"
+ "addu %[cost], %[cost], %[temp_3] \n\t"
+ "addu %[temp_1], %[temp_1], %[temp_2] \n\t"
+ "sll %[temp_2], %[b_reg], 3 \n\t"
+ "sll %[temp_3], %[b_reg], 5 \n\t"
+ "sub %[temp_2], %[temp_3], %[temp_2] \n\t"
+ "sll %[temp_3], %[temp_2], 4 \n\t"
+ "addu %[temp_1], %[temp_1], %[temp_3] \n\t"
+ "addu %[temp_2], %[temp_2], %[res_cost] \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "addu %[t], %[temp_1], %[temp_2] \n\t"
+ "slt %[temp_1], %[n], %[res_last] \n\t"
+ "bnez %[temp_1], 1b \n\t"
+ " addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
+ "2: \n\t"
+
+ ".set pop \n\t"
+ : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
+ [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
+ : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
+ [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
+ [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
+ [res_cost]"r"(res_cost)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+#define GET_SSE_INNER(A, B, C, D) \
+ "lbu %[temp0], "#A"(%[a]) \n\t" \
+ "lbu %[temp1], "#A"(%[b]) \n\t" \
+ "lbu %[temp2], "#B"(%[a]) \n\t" \
+ "lbu %[temp3], "#B"(%[b]) \n\t" \
+ "lbu %[temp4], "#C"(%[a]) \n\t" \
+ "lbu %[temp5], "#C"(%[b]) \n\t" \
+ "lbu %[temp6], "#D"(%[a]) \n\t" \
+ "lbu %[temp7], "#D"(%[b]) \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "subu %[temp6], %[temp6], %[temp7] \n\t" \
+ "madd %[temp0], %[temp0] \n\t" \
+ "madd %[temp2], %[temp2] \n\t" \
+ "madd %[temp4], %[temp4] \n\t" \
+ "madd %[temp6], %[temp6] \n\t"
+
+#define GET_SSE(A, B, C, D) \
+ GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
+ GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
+ GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
+ GET_SSE_INNER(D, D + 1, D + 2, D + 3)
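+
+// Each GET_SSE(A, B, C, D) accumulates 16 squared byte differences:
+// 'mult $zero, $zero' first clears the hi/lo accumulator, every 'madd' adds
+// one squared difference to it, and 'mflo' reads back the final sum.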
+
+#if !defined(WORK_AROUND_GCC)
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0, 4, 8, 12)
+ GET_SSE( 16, 20, 24, 28)
+ GET_SSE( 32, 36, 40, 44)
+ GET_SSE( 48, 52, 56, 60)
+ GET_SSE( 64, 68, 72, 76)
+ GET_SSE( 80, 84, 88, 92)
+ GET_SSE( 96, 100, 104, 108)
+ GET_SSE(112, 116, 120, 124)
+ GET_SSE(128, 132, 136, 140)
+ GET_SSE(144, 148, 152, 156)
+ GET_SSE(160, 164, 168, 172)
+ GET_SSE(176, 180, 184, 188)
+ GET_SSE(192, 196, 200, 204)
+ GET_SSE(208, 212, 216, 220)
+ GET_SSE(224, 228, 232, 236)
+ GET_SSE(240, 244, 248, 252)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi" , "lo"
+ );
+ return count;
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0, 4, 8, 12)
+ GET_SSE( 16, 20, 24, 28)
+ GET_SSE( 32, 36, 40, 44)
+ GET_SSE( 48, 52, 56, 60)
+ GET_SSE( 64, 68, 72, 76)
+ GET_SSE( 80, 84, 88, 92)
+ GET_SSE( 96, 100, 104, 108)
+ GET_SSE(112, 116, 120, 124)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi" , "lo"
+ );
+ return count;
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0, 4, 16, 20)
+ GET_SSE(32, 36, 48, 52)
+ GET_SSE(64, 68, 80, 84)
+ GET_SSE(96, 100, 112, 116)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi" , "lo"
+ );
+ return count;
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE(0, 16, 32, 48)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi" , "lo"
+ );
+ return count;
+}
+
+#endif // WORK_AROUND_GCC
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPS32(void);
+
+void VP8EncDspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+ VP8ITransform = ITransform;
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
+ VP8FTransform = FTransform;
+#if !defined(WORK_AROUND_GCC)
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE4x4 = SSE4x4;
+#endif
+#endif // WEBP_USE_MIPS32
+}
diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c
index 52cca18..5814fac 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -15,18 +15,122 @@
#if defined(WEBP_USE_NEON)
+#include <assert.h>
+
+#include "./neon.h"
#include "../enc/vp8enci.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Inverse transform.
-// This code is pretty much the same as TransformOneNEON in the decoder, except
+// This code is pretty much the same as TransformOne in dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.
+
+static const int16_t kC1 = 20091;
+static const int16_t kC2 = 17734;  // half of the 'real' kC2 constant. See comment above.
+
+// This code works but is *slower* than the inlined-asm version below
+// (with gcc-4.6), so it is disabled for now. Later, it'll be conditional on
+// the USE_INTRINSICS define.
+// With gcc-4.8, it's a little faster than the inlined assembly.
+#if defined(USE_INTRINSICS)
+
+// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+}
+
+// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
+// to the corresponding rows of 'dst'.
+static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
+ // Unsigned saturate to 8b.
+ const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
+ const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
+
+ // Store the results.
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
+}
+
+static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
+ const uint8_t* const ref, uint8_t* const dst) {
+ uint32x2_t dst01 = vdup_n_u32(0);
+ uint32x2_t dst23 = vdup_n_u32(0);
+
+ // Load the source pixels.
+ dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
+ dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
+ dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
+ dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
+
+ {
+ // Convert to 16b.
+ const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
+ const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+
+ // Descale with rounding.
+ const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
+ const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
+ // Add the inverse transform.
+ SaturateAndStore4x4(dst, out01, out23);
+ }
+}
+
+static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
+ int16x8x2_t* const out) {
+ // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
+ // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
+ const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
+ // b0 d0 b1 d1 b2 d2 ...
+ *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
+}
+
+static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+ // {rows} = in0 | in4
+ // in8 | in12
+ // B1 = in4 | in12
+ const int16x8_t B1 =
+ vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
+ // C0 = kC1 * in4 | kC1 * in12
+ // C1 = kC2 * in4 | kC2 * in12
+ const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
+ const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
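+ // vqdmulhq_n_s16(x, k) computes (2 * x * k) >> 16, so with the halved kC2
+ // above, C1 == (B1 * 2 * 17734) >> 16 ~= B1 * sqrt(2) * sin(pi / 8), and
+ // C0 == B1 + ((B1 * kC1) >> 16) ~= B1 * sqrt(2) * cos(pi / 8).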
+ const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 + in8
+ const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 - in8
+ // c = kC2 * in4 - kC1 * in12
+ // d = kC1 * in4 + kC2 * in12
+ const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
+ const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
+ const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
+ const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
+ const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
+ const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
+ const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
+ Transpose8x2(E0, E1, rows);
+}
+
+static void ITransformOne(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
+ int16x8x2_t rows;
+ INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
+ TransformPass(&rows);
+ TransformPass(&rows);
+ Add4x4(rows.val[0], rows.val[1], ref, dst);
+}
+
+#else
+
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
- const int16_t kC1C2[] = { 20091, 17734, 0, 0 }; // kC1 / (kC2 >> 1) / 0 / 0
+ const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
@@ -137,6 +241,8 @@ static void ITransformOne(const uint8_t* ref,
);
}
+#endif // USE_INTRINSICS
+
static void ITransform(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
@@ -145,76 +251,102 @@ static void ITransform(const uint8_t* ref,
}
}
-// Same code as dec_neon.c
-static void ITransformWHT(const int16_t* in, int16_t* out) {
- const int kStep = 32; // The store is only incrementing the pointer as if we
- // had stored a single byte.
- __asm__ volatile (
- // part 1
- // load data into q0, q1
- "vld1.16 {q0, q1}, [%[in]] \n"
-
- "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
- "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
- "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]
- "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]
-
- "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
- "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
- "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2
- "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2
-
- // Transpose
- // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
- // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
- "vswp d1, d4 \n" // vtrn.64 q0, q2
- "vswp d3, d6 \n" // vtrn.64 q1, q3
- "vtrn.32 q0, q1 \n"
- "vtrn.32 q2, q3 \n"
-
- "vmov.s32 q4, #3 \n" // dc = 3
- "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3
- "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]
- "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]
- "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
- "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
-
- "vadd.s32 q0, q6, q7 \n"
- "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
- "vadd.s32 q1, q9, q8 \n"
- "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
- "vsub.s32 q2, q6, q7 \n"
- "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
- "vsub.s32 q3, q9, q8 \n"
- "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
-
- // set the results to output
- "vst1.16 d0[0], [%[out]], %[kStep] \n"
- "vst1.16 d1[0], [%[out]], %[kStep] \n"
- "vst1.16 d2[0], [%[out]], %[kStep] \n"
- "vst1.16 d3[0], [%[out]], %[kStep] \n"
- "vst1.16 d0[1], [%[out]], %[kStep] \n"
- "vst1.16 d1[1], [%[out]], %[kStep] \n"
- "vst1.16 d2[1], [%[out]], %[kStep] \n"
- "vst1.16 d3[1], [%[out]], %[kStep] \n"
- "vst1.16 d0[2], [%[out]], %[kStep] \n"
- "vst1.16 d1[2], [%[out]], %[kStep] \n"
- "vst1.16 d2[2], [%[out]], %[kStep] \n"
- "vst1.16 d3[2], [%[out]], %[kStep] \n"
- "vst1.16 d0[3], [%[out]], %[kStep] \n"
- "vst1.16 d1[3], [%[out]], %[kStep] \n"
- "vst1.16 d2[3], [%[out]], %[kStep] \n"
- "vst1.16 d3[3], [%[out]], %[kStep] \n"
-
- : [out] "+r"(out) // modified registers
- : [in] "r"(in), [kStep] "r"(kStep) // constants
- : "memory", "q0", "q1", "q2", "q3", "q4",
- "q5", "q6", "q7", "q8", "q9" // clobbered
- );
+// Load the 16 pixels of a 4x4 block into a single uint8x16_t variable.
+static uint8x16_t Load4x4(const uint8_t* src) {
+ uint32x4_t out = vdupq_n_u32(0);
+ out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
+ out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
+ out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
+ out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
+ return vreinterpretq_u8_u32(out);
}
// Forward transform.
+#if defined(USE_INTRINSICS)
+
+static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
+ const int16x4_t C, const int16x4_t D,
+ int16x8_t* const out01,
+ int16x8_t* const out32) {
+ const int16x4x2_t AB = vtrn_s16(A, B);
+ const int16x4x2_t CD = vtrn_s16(C, D);
+ const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
+ vreinterpret_s32_s16(CD.val[0]));
+ const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
+ vreinterpret_s32_s16(CD.val[1]));
+ *out01 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
+ vreinterpret_s64_s32(tmp13.val[0])));
+ *out32 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
+ vreinterpret_s64_s32(tmp02.val[1])));
+}
+
+static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
+ const uint8x8_t b) {
+ return vreinterpretq_s16_u16(vsubl_u8(a, b));
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ int16x8_t d0d1, d3d2; // working 4x4 int16 variables
+ {
+ const uint8x16_t S0 = Load4x4(src);
+ const uint8x16_t R0 = Load4x4(ref);
+ const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
+ const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
+ const int16x4_t D0 = vget_low_s16(D0D1);
+ const int16x4_t D1 = vget_high_s16(D0D1);
+ const int16x4_t D2 = vget_low_s16(D2D3);
+ const int16x4_t D3 = vget_high_s16(D2D3);
+ Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
+ }
+ { // 1st pass
+ const int32x4_t kCst937 = vdupq_n_s32(937);
+ const int32x4_t kCst1812 = vdupq_n_s32(1812);
+ const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
+ const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
+ const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
+ const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
+ vget_high_s16(a0a1_2));
+ const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
+ vget_high_s16(a0a1_2));
+ const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+ const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+ const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+ const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+ const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
+ const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
+ Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+ }
+ { // 2nd pass
+ // the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
+ const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
+ const int32x4_t kCst51000 = vdupq_n_s32(51000);
+ const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
+ const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
+ const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
+ const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
+ const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
+ const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+ const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+ const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+ const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+ const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
+ const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
+ const int16x4_t a3_eq_0 =
+ vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
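+ // vceq yields all-ones (== -1) in the lanes where a3 == 0, so the add
+ // below computes tmp1 - (a3 == 0); with the extra (1 << 16) folded into
+ // kCst12000, the net effect is the '+ (a3 != 0)' of the C version.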
+ const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
+ vst1_s16(out + 0, out0);
+ vst1_s16(out + 4, out1);
+ vst1_s16(out + 8, out2);
+ vst1_s16(out + 12, out3);
+ }
+}
+
+#else
+
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
@@ -339,69 +471,76 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
);
}
-static void FTransformWHT(const int16_t* in, int16_t* out) {
- const int kStep = 32;
- __asm__ volatile (
- // d0 = in[0 * 16] , d1 = in[1 * 16]
- // d2 = in[2 * 16] , d3 = in[3 * 16]
- "vld1.16 d0[0], [%[in]], %[kStep] \n"
- "vld1.16 d1[0], [%[in]], %[kStep] \n"
- "vld1.16 d2[0], [%[in]], %[kStep] \n"
- "vld1.16 d3[0], [%[in]], %[kStep] \n"
- "vld1.16 d0[1], [%[in]], %[kStep] \n"
- "vld1.16 d1[1], [%[in]], %[kStep] \n"
- "vld1.16 d2[1], [%[in]], %[kStep] \n"
- "vld1.16 d3[1], [%[in]], %[kStep] \n"
- "vld1.16 d0[2], [%[in]], %[kStep] \n"
- "vld1.16 d1[2], [%[in]], %[kStep] \n"
- "vld1.16 d2[2], [%[in]], %[kStep] \n"
- "vld1.16 d3[2], [%[in]], %[kStep] \n"
- "vld1.16 d0[3], [%[in]], %[kStep] \n"
- "vld1.16 d1[3], [%[in]], %[kStep] \n"
- "vld1.16 d2[3], [%[in]], %[kStep] \n"
- "vld1.16 d3[3], [%[in]], %[kStep] \n"
-
- "vaddl.s16 q2, d0, d2 \n" // a0=(in[0*16]+in[2*16])
- "vaddl.s16 q3, d1, d3 \n" // a1=(in[1*16]+in[3*16])
- "vsubl.s16 q4, d1, d3 \n" // a2=(in[1*16]-in[3*16])
- "vsubl.s16 q5, d0, d2 \n" // a3=(in[0*16]-in[2*16])
-
- "vqadd.s32 q6, q2, q3 \n" // a0 + a1
- "vqadd.s32 q7, q5, q4 \n" // a3 + a2
- "vqsub.s32 q8, q5, q4 \n" // a3 - a2
- "vqsub.s32 q9, q2, q3 \n" // a0 - a1
-
- // Transpose
- // q6 = tmp[0, 1, 2, 3] ; q7 = tmp[ 4, 5, 6, 7]
- // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
- "vswp d13, d16 \n" // vtrn.64 q0, q2
- "vswp d15, d18 \n" // vtrn.64 q1, q3
- "vtrn.32 q6, q7 \n"
- "vtrn.32 q8, q9 \n"
-
- "vqadd.s32 q0, q6, q8 \n" // a0 = tmp[0] + tmp[8]
- "vqadd.s32 q1, q7, q9 \n" // a1 = tmp[4] + tmp[12]
- "vqsub.s32 q2, q7, q9 \n" // a2 = tmp[4] - tmp[12]
- "vqsub.s32 q3, q6, q8 \n" // a3 = tmp[0] - tmp[8]
-
- "vqadd.s32 q4, q0, q1 \n" // b0 = a0 + a1
- "vqadd.s32 q5, q3, q2 \n" // b1 = a3 + a2
- "vqsub.s32 q6, q3, q2 \n" // b2 = a3 - a2
- "vqsub.s32 q7, q0, q1 \n" // b3 = a0 - a1
-
- "vshrn.s32 d18, q4, #1 \n" // b0 >> 1
- "vshrn.s32 d19, q5, #1 \n" // b1 >> 1
- "vshrn.s32 d20, q6, #1 \n" // b2 >> 1
- "vshrn.s32 d21, q7, #1 \n" // b3 >> 1
-
- "vst1.16 {q9, q10}, [%[out]] \n"
-
- : [in] "+r"(in)
- : [kStep] "r"(kStep), [out] "r"(out)
- : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
- "q6", "q7", "q8", "q9", "q10" // clobbered
- ) ;
+#endif
+
+#define LOAD_LANE_16b(VALUE, LANE) do { \
+ (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
+ src += stride; \
+} while (0)
+
+static void FTransformWHT(const int16_t* src, int16_t* out) {
+ const int stride = 16;
+ const int16x4_t zero = vdup_n_s16(0);
+ int32x4x4_t tmp0;
+ int16x4x4_t in;
+ INIT_VECTOR4(in, zero, zero, zero, zero);
+ LOAD_LANE_16b(in.val[0], 0);
+ LOAD_LANE_16b(in.val[1], 0);
+ LOAD_LANE_16b(in.val[2], 0);
+ LOAD_LANE_16b(in.val[3], 0);
+ LOAD_LANE_16b(in.val[0], 1);
+ LOAD_LANE_16b(in.val[1], 1);
+ LOAD_LANE_16b(in.val[2], 1);
+ LOAD_LANE_16b(in.val[3], 1);
+ LOAD_LANE_16b(in.val[0], 2);
+ LOAD_LANE_16b(in.val[1], 2);
+ LOAD_LANE_16b(in.val[2], 2);
+ LOAD_LANE_16b(in.val[3], 2);
+ LOAD_LANE_16b(in.val[0], 3);
+ LOAD_LANE_16b(in.val[1], 3);
+ LOAD_LANE_16b(in.val[2], 3);
+ LOAD_LANE_16b(in.val[3], 3);
+
+ {
+ // a0 = in[0 * 16] + in[2 * 16]
+ // a1 = in[1 * 16] + in[3 * 16]
+ // a2 = in[1 * 16] - in[3 * 16]
+ // a3 = in[0 * 16] - in[2 * 16]
+ const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
+ const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
+ const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
+ const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
+ tmp0.val[0] = vaddq_s32(a0, a1);
+ tmp0.val[1] = vaddq_s32(a3, a2);
+ tmp0.val[2] = vsubq_s32(a3, a2);
+ tmp0.val[3] = vsubq_s32(a0, a1);
+ }
+ {
+ const int32x4x4_t tmp1 = Transpose4x4(tmp0);
+ // a0 = tmp[0 + i] + tmp[ 8 + i]
+ // a1 = tmp[4 + i] + tmp[12 + i]
+ // a2 = tmp[4 + i] - tmp[12 + i]
+ // a3 = tmp[0 + i] - tmp[ 8 + i]
+ const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
+ const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
+ const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
+ const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
+ const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
+ const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
+ const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
+ const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
+ const int16x4_t out0 = vmovn_s32(b0);
+ const int16x4_t out1 = vmovn_s32(b1);
+ const int16x4_t out2 = vmovn_s32(b2);
+ const int16x4_t out3 = vmovn_s32(b3);
+
+ vst1_s16(out + 0, out0);
+ vst1_s16(out + 4, out1);
+ vst1_s16(out + 8, out2);
+ vst1_s16(out + 12, out3);
+ }
}
+#undef LOAD_LANE_16b
//------------------------------------------------------------------------------
// Texture distortion
@@ -409,9 +548,136 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
+// This code works but is *slower* than the inlined-asm version below
+// (with gcc-4.6), so it is disabled for now. Later, it'll be conditional on
+// the USE_INTRINSICS define.
+// With gcc-4.8, it's only slightly slower than the inlined assembly.
+#if defined(USE_INTRINSICS)
+
+// Zero-extends a uint16x4_t 'v' to an int32x4_t.
+static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
+ return vreinterpretq_s32_u32(vmovl_u16(v));
+}
+
+// Does a regular 4x4 transpose followed by an adjustment of the upper columns
+// in the inner rows to restore the source order of differences,
+// i.e., a0 - a1 | a3 - a2.
+static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
+ int32x4x4_t out = Transpose4x4(rows);
+ // restore source order in the columns containing differences.
+ const int32x2_t r1h = vget_high_s32(out.val[1]);
+ const int32x2_t r2h = vget_high_s32(out.val[2]);
+ out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
+ out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
+ return out;
+}
+
+static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
+ const uint8x8_t r2r3) {
+ // a0 = in[0] + in[2] | a1 = in[1] + in[3]
+ const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
+ // a3 = in[0] - in[2] | a2 = in[1] - in[3]
+ const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
+ const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1
+ const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2
+ // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
+ // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
+ // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
+ const int16x8x2_t transpose =
+ vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
+ // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
+ const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
+ vget_low_s16(transpose.val[1]));
+ const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
+ vget_high_s16(transpose.val[1]));
+ // [0]: tmp[3] [1]: tmp[2]
+ const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
+ const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
+ return res;
+}
+
+static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
+ // a0 = tmp[0 + i] + tmp[8 + i];
+ const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
+ // a1 = tmp[4 + i] + tmp[12+ i];
+ const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
+ // a2 = tmp[4 + i] - tmp[12+ i];
+ const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
+ // a3 = tmp[0 + i] - tmp[8 + i];
+ const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
+ const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1)
+ const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2)
+ const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2)
+ const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1)
+ const int32x4x4_t res = { { b0, b1, b2, b3 } };
+ return res;
+}
+
+// Calculate the weighted sum of the rows in 'b'.
+static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
+ const int32x4_t w0, const int32x4_t w1,
+ const int32x4_t w2, const int32x4_t w3) {
+ const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
+ const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
+ const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
+ const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
+ const int64x2_t sum1 = vpaddlq_s32(s3);
+ const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
+ return sum2;
+}
+
+#define LOAD_LANE_32b(src, VALUE, LANE) \
+ (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ uint32x4_t d0d1 = { 0, 0, 0, 0 };
+ uint32x4_t d2d3 = { 0, 0, 0, 0 };
+ LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03
+ LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13
+ LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03
+ LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13
+ LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23
+ LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33
+ LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23
+ LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33
+
+ {
+ // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
+ // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
+ const uint16x8x2_t tmp =
+ vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
+ const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
+ const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
+ const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
+ vget_low_u8(d2d3u8));
+ const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
+ vget_high_u8(d2d3u8));
+ const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
+ const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
+ const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
+ const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
+ const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
+ const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
+ const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
+ const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
+ const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
+ const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
+ const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
+ vreinterpret_s32_s64(sum2));
+ const int32x2_t res = vshr_n_s32(diff, 5);
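+ // i.e. |weighted_sum(a) - weighted_sum(b)| >> 5, matching the
+ // 'abs(diff_sum) >> 5' of the non-NEON Disto4x4 versions.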
+ return vget_lane_s32(res, 0);
+ }
+}
+
+#undef LOAD_LANE_32b
+
+#else
+
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
-// This uses a TTransform helper function in C
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int kBPS = BPS;
@@ -598,6 +864,8 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
return sum;
}
+#endif // USE_INTRINSICS
+
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
@@ -610,6 +878,179 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
return D;
}
+//------------------------------------------------------------------------------
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
+ int j;
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ {
+ int k;
+ const int16x8_t a0 = vld1q_s16(out + 0);
+ const int16x8_t b0 = vld1q_s16(out + 8);
+ const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
+ const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
+ const uint16x8_t a2 = vshrq_n_u16(a1, 3);
+ const uint16x8_t b2 = vshrq_n_u16(b1, 3);
+ const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
+ const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
+ vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
+ vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ histo->distribution[out[k]]++;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
+ const uint8_t* const b,
+ uint32x4_t* const sum) {
+ const uint8x16_t a0 = vld1q_u8(a);
+ const uint8x16_t b0 = vld1q_u8(b);
+ const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+ uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
+ prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
+ *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate
+}
+
+// Horizontal sum of all four uint32_t values in 'sum'.
+static int SumToInt(uint32x4_t sum) {
+ const uint64x2_t sum2 = vpaddlq_u32(sum);
+ const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
+ return (int)sum3;
+}
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 16; ++y) {
+ AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+ }
+ return SumToInt(sum);
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 8; ++y) {
+ AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+ }
+ return SumToInt(sum);
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 8; ++y) {
+ const uint8x8_t a0 = vld1_u8(a + y * BPS);
+ const uint8x8_t b0 = vld1_u8(b + y * BPS);
+ const uint8x8_t abs_diff = vabd_u8(a0, b0);
+ const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
+ sum = vpadalq_u16(sum, prod);
+ }
+ return SumToInt(sum);
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+ const uint8x16_t a0 = Load4x4(a);
+ const uint8x16_t b0 = Load4x4(b);
+ const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+ uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
+ prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
+ return SumToInt(vpaddlq_u16(prod));
+}
+
+//------------------------------------------------------------------------------
+
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)
+
+static int16x8_t Quantize(int16_t* const in,
+ const VP8Matrix* const mtx, int offset) {
+ const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
+ const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
+ const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
+ const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
+ const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
+
+ const int16x8_t a = vld1q_s16(in + offset); // in
+ const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
+ const int16x8_t sign = vshrq_n_s16(a, 15); // sign
+ const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
+ const uint32x4_t m2 = vhaddq_u32(m0, bias0);
+ const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
+ const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
+ vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
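+ // vhaddq_u32 above already shifted right by 1 and vshrn_n_u32(., 16)
+ // shifts by 16 more: 1 + 16 == QFIX, with the halving add avoiding 32-bit
+ // overflow when the bias is added in.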
+ const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
+ const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
+ const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
+ const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
+ vst1q_s16(in + offset, c4);
+ assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
+ return c3;
+}
+
+static const uint8_t kShuffles[4][8] = {
+ { 0, 1, 2, 3, 8, 9, 16, 17 },
+ { 10, 11, 4, 5, 6, 7, 12, 13 },
+ { 18, 19, 24, 25, 26, 27, 20, 21 },
+ { 14, 15, 22, 23, 28, 29, 30, 31 }
+};
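+// Each byte pair (2 * i, 2 * i + 1) in these tables selects int16
+// coefficient 'i' of the quantized output, so the four lookups below emit
+// the coefficients in VP8 zigzag order:
+// 0 1 4 8 | 5 2 3 6 | 9 12 13 10 | 7 11 14 15.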
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ const int16x8_t out0 = Quantize(in, mtx, 0);
+ const int16x8_t out1 = Quantize(in, mtx, 8);
+ uint8x8x4_t shuffles;
+ // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+ // non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+ defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+ uint8x16x2_t all_out;
+ INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+ INIT_VECTOR4(shuffles,
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
+ uint8x8x4_t all_out;
+ INIT_VECTOR4(all_out,
+ vreinterpret_u8_s16(vget_low_s16(out0)),
+ vreinterpret_u8_s16(vget_high_s16(out0)),
+ vreinterpret_u8_s16(vget_low_s16(out1)),
+ vreinterpret_u8_s16(vget_high_s16(out1)));
+ INIT_VECTOR4(shuffles,
+ vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
+ // Zigzag reordering
+ vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
+ vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
+ vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
+ vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
+ // test zeros
+ if (*(uint64_t*)(out + 0) != 0) return 1;
+ if (*(uint64_t*)(out + 4) != 0) return 1;
+ if (*(uint64_t*)(out + 8) != 0) return 1;
+ if (*(uint64_t*)(out + 12) != 0) return 1;
+ return 0;
+}
+
+#endif // !WORK_AROUND_GCC
+
#endif // WEBP_USE_NEON
//------------------------------------------------------------------------------
@@ -622,11 +1063,17 @@ void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
- VP8ITransformWHT = ITransformWHT;
VP8FTransformWHT = FTransformWHT;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
+ VP8CollectHistogram = CollectHistogram;
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE4x4 = SSE4x4;
+#if !defined(WORK_AROUND_GCC)
+ VP8EncQuantizeBlock = QuantizeBlock;
+#endif
#endif // WEBP_USE_NEON
}
-
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index 540a3cb..9958d9f 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -17,7 +17,9 @@
#include <stdlib.h> // for abs()
#include <emmintrin.h>
+#include "../enc/cost.h"
#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
//------------------------------------------------------------------------------
// Quite useful macro for debugging. Left here for convenience.
@@ -52,9 +54,9 @@ static void PrintReg(const __m128i r, const char* const name, int size) {
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
-static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
for (j = start_block; j < end_block; ++j) {
@@ -98,8 +100,8 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
-static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -318,8 +320,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
-static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
- int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k937 = _mm_set1_epi32(937);
@@ -444,14 +445,14 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
// -> f1 = f1 + 1 - (a3 == 0)
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
- _mm_storel_epi64((__m128i*)&out[ 0], d0);
- _mm_storel_epi64((__m128i*)&out[ 4], g1);
- _mm_storel_epi64((__m128i*)&out[ 8], d2);
- _mm_storel_epi64((__m128i*)&out[12], f3);
+ const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+ const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+ _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+ _mm_storeu_si128((__m128i*)&out[8], d2_f3);
}
}
-static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
+static void FTransformWHT(const int16_t* in, int16_t* out) {
int32_t tmp[16];
int i;
for (i = 0; i < 4; ++i, in += 64) {
@@ -487,8 +488,8 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
// Metric
-static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
- int num_quads, int do_16) {
+static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
+ int num_quads, int do_16) {
const __m128i zero = _mm_setzero_si128();
__m128i sum1 = zero;
__m128i sum2 = zero;
@@ -565,19 +566,19 @@ static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
}
}
-static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4SSE2(a, b, 4, 1);
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ return SSE_Nx4(a, b, 4, 1);
}
-static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4SSE2(a, b, 2, 1);
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ return SSE_Nx4(a, b, 2, 1);
}
-static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4SSE2(a, b, 2, 0);
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ return SSE_Nx4(a, b, 2, 0);
}
-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
@@ -634,8 +635,8 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
-static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
- const uint16_t* const w) {
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
@@ -782,19 +783,19 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
-static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int diff_sum = TTransformSSE2(a, b, w);
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform(a, b, w);
return abs(diff_sum) >> 5;
}
-static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4SSE2(a + x + y, b + x + y, w);
+ D += Disto4x4(a + x + y, b + x + y, w);
}
}
return D;
@@ -804,9 +805,9 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
-// Simple quantization
-static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
- int n, const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
@@ -818,18 +819,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
// we can use _mm_load_si128 instead of _mm_loadu_si128.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
- const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
- const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
- const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
- const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
- // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
- const __m128i sign0 = _mm_srai_epi16(in0, 15);
- const __m128i sign8 = _mm_srai_epi16(in8, 15);
+ // extract sign(in) (0x0000 if positive, 0xffff if negative)
+ const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+ const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
// coeff = abs(in) = (in ^ sign) - sign
coeff0 = _mm_xor_si128(in0, sign0);
@@ -838,32 +835,35 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
coeff8 = _mm_sub_epi16(coeff8, sign8);
// coeff = abs(in) + sharpen
- coeff0 = _mm_add_epi16(coeff0, sharpen0);
- coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
- // out = (coeff * iQ + B) >> QFIX;
+ // out = (coeff * iQ + B) >> QFIX
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
- __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
- __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
- __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
- __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
- // expand bias from 16b to 32b
- __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
- __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
- __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
- __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
// out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
- // out = (coeff * iQ + B) >> QFIX;
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
@@ -916,19 +916,44 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
}
// detect if all 'out' values are zeroes or not
- {
- int32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, packed_out);
- if (n) {
- tmp[0] &= ~0xff;
- }
- return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
- }
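+ // _mm_cmpeq_epi8 marks each zero byte of packed_out with 0xff; the movemask
+ // is 0xffff only when all 16 bytes are zero, i.e. when no quantized
+ // coefficient survived.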
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
-static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- return QuantizeBlockSSE2(in, out, 0, mtx);
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, NULL, mtx);
+}
+
+// Forward declaration.
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+ VP8Residual* const res);
+
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+ // Use SSE to compare 8 values with a single instruction.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
+ const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
+ // Get the comparison results as a bitmask of two times 16 bits: two
+ // identical bits for each result. Concatenate both bitmasks into a single
+ // 32-bit value. Negate the mask to find the entries that are not equal to
+ // zero. We don't need to mask out the least significant bits according to
+ // res->first, since coeffs[0] is 0 if res->first > 0.
+ const uint32_t mask =
+ ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
+ // The position of the most significant non-zero bit indicates the position of
+ // the last non-zero value. Divide the result by two because _mm_movemask_epi8
+ // operates on 8 bit values instead of 16 bit values.
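+ // Example: if coeffs[3] is the only non-zero value, _mm_movemask_epi8(m0)
+ // is 0xff3f and _mm_movemask_epi8(m1) is 0xffff, so mask == ~0xffffff3f ==
+ // 0xc0; BitsLog2Floor(0xc0) == 7 and 7 >> 1 == 3 == res->last.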
+ assert(res->first == 0 || coeffs[0] == 0);
+ res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
+ res->coeffs = coeffs;
}
#endif // WEBP_USE_SSE2
@@ -940,18 +965,18 @@ extern void VP8EncDspInitSSE2(void);
void VP8EncDspInitSSE2(void) {
#if defined(WEBP_USE_SSE2)
- VP8CollectHistogram = CollectHistogramSSE2;
- VP8EncQuantizeBlock = QuantizeBlockSSE2;
- VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
- VP8ITransform = ITransformSSE2;
- VP8FTransform = FTransformSSE2;
- VP8FTransformWHT = FTransformWHTSSE2;
- VP8SSE16x16 = SSE16x16SSE2;
- VP8SSE16x8 = SSE16x8SSE2;
- VP8SSE8x8 = SSE8x8SSE2;
- VP8SSE4x4 = SSE4x4SSE2;
- VP8TDisto4x4 = Disto4x4SSE2;
- VP8TDisto16x16 = Disto16x16SSE2;
+ VP8CollectHistogram = CollectHistogram;
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+ VP8ITransform = ITransform;
+ VP8FTransform = FTransform;
+ VP8FTransformWHT = FTransformWHT;
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE4x4 = SSE4x4;
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
#endif // WEBP_USE_SSE2
}
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index bab76d2..ee334bc 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -15,21 +15,16 @@
#include "./dsp.h"
-#if defined(WEBP_USE_SSE2)
-#include <emmintrin.h>
-#endif
-
#include <math.h>
#include <stdlib.h>
-#include "./lossless.h"
#include "../dec/vp8li.h"
+#include "../utils/endian_inl.h"
+#include "./lossless.h"
#include "./yuv.h"
#define MAX_DIFF_COST (1e30f)
// lookup table for small values of log2(int)
-#define APPROX_LOG_MAX 4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.0000000000000000f, 0.0000000000000000f,
1.0000000000000000f, 1.5849625007211560f,
@@ -331,30 +326,59 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
-float VP8LFastSLog2Slow(int v) {
+// The threshold up to which the approximate version of log_2 can be used.
+// Practically, we can get rid of the call to log() as the two values match
+// to a very high degree (the ratio of these two is 0.99999x).
+// Keeping a high threshold for now.
+#define APPROX_LOG_WITH_CORRECTION_MAX 65536
+#define APPROX_LOG_MAX 4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+static float FastSLog2Slow(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
- if (v < APPROX_LOG_MAX) {
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
+ uint32_t y = 1;
+ int correction = 0;
const float v_f = (float)v;
- while (v >= LOG_LOOKUP_IDX_MAX) {
+ const uint32_t orig_v = v;
+ do {
++log_cnt;
v = v >> 1;
- }
- return v_f * (kLog2Table[v] + log_cnt);
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+ // LOG_2_RECIPROCAL ~ 23/16
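+ // Example: v = 301 shifts once to 150 (log_cnt = 1, y = 2), so
+ // correction == (23 * (301 & 1)) >> 4 == 1 and the result is
+ // 301 * (kLog2Table[150] + 1) + 1 ~= 2477.9, close to the exact
+ // 301 * log2(301) ~= 2478.3.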
+ correction = (23 * (orig_v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[v] + log_cnt) + correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
}
}
-float VP8LFastLog2Slow(int v) {
+static float FastLog2Slow(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
- if (v < APPROX_LOG_MAX) {
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
- while (v >= LOG_LOOKUP_IDX_MAX) {
+ uint32_t y = 1;
+ const uint32_t orig_v = v;
+ double log_2;
+ do {
++log_cnt;
v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+ log_2 = kLog2Table[v] + log_cnt;
+ if (orig_v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+ const int correction = (23 * (orig_v & (y - 1))) >> 4;
+ log_2 += (double)correction / orig_v;
}
- return kLog2Table[v] + log_cnt;
+ return (float)log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
}
@@ -363,6 +387,9 @@ float VP8LFastLog2Slow(int v) {
//------------------------------------------------------------------------------
// Image transforms.
+// Mostly used to reduce code size and improve readability.
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+
// In-place sum of each component with mod 256.
static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
@@ -406,7 +433,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
- return (a << 24) | (r << 16) | (g << 8) | b;
+ return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
@@ -420,15 +447,24 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
- return (a << 24) | (r << 16) | (g << 8) | b;
+ return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
-static WEBP_INLINE int Sub3(int a, int b, int c) {
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
const int pb = b - c;
const int pa = a - c;
return abs(pb) - abs(pa);
}
+#undef LOCAL_INLINE
+
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
const int pa_minus_pb =
Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
@@ -489,21 +525,19 @@ static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
return pred;
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
+ const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
+ const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
+ const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
-// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
-typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
-static const PredictorFunc kPredictors[16] = {
+static const VP8LPredictorFunc kPredictorsC[16] = {
Predictor0, Predictor1, Predictor2, Predictor3,
Predictor4, Predictor5, Predictor6, Predictor7,
Predictor8, Predictor9, Predictor10, Predictor11,
@@ -511,10 +545,9 @@ static const PredictorFunc kPredictors[16] = {
Predictor0, Predictor0 // <- padding security sentinels
};
-// TODO(vikasa): Replace 256 etc with defines.
-static float PredictionCostSpatial(const int* counts,
- int weight_0, double exp_val) {
- const int significant_symbols = 16;
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+ double exp_val) {
+ const int significant_symbols = 256 >> 4;
const double exp_decay_factor = 0.6;
double bits = weight_0 * counts[0];
int i;
@@ -526,19 +559,19 @@ static float PredictionCostSpatial(const int* counts,
}
// Compute the combined Shannon entropy for the distributions {X} and {X+Y}
-static float CombinedShannonEntropy(const int* const X,
- const int* const Y, int n) {
+static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
- for (i = 0; i < n; ++i) {
+ for (i = 0; i < 256; ++i) {
const int x = X[i];
- const int xy = X[i] + Y[i];
+ const int xy = x + Y[i];
if (x != 0) {
sumX += x;
retval -= VP8LFastSLog2(x);
- }
- if (xy != 0) {
+ sumXY += xy;
+ retval -= VP8LFastSLog2(xy);
+ } else if (xy != 0) {
sumXY += xy;
retval -= VP8LFastSLog2(xy);
}
@@ -547,50 +580,53 @@ static float CombinedShannonEntropy(const int* const X,
return (float)retval;
}
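
The loop above tracks sumX and sumXY because the entropy cost of a distribution {n_i} is sum * log2(sum) - sum(n_i * log2(n_i)); CombinedShannonEntropy returns that cost for {X} plus the same for {X+Y} in a single pass. A single-distribution sketch using plain log2():

    #include <math.h>
    static double ShannonEntropyCost(const int counts[], int n) {
      int i, sum = 0;
      double cost = 0.;
      for (i = 0; i < n; ++i) {
        if (counts[i] != 0) {
          sum += counts[i];
          cost -= counts[i] * log2((double)counts[i]);
        }
      }
      return (sum > 0) ? cost + sum * log2((double)sum) : 0.;
    }
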
-static float PredictionCostSpatialHistogram(int accumulated[4][256],
- int tile[4][256]) {
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+ const int tile[4][256]) {
int i;
double retval = 0;
for (i = 0; i < 4; ++i) {
const double kExpValue = 0.94;
retval += PredictionCostSpatial(tile[i], 1, kExpValue);
- retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
+ retval += CombinedShannonEntropy(tile[i], accumulated[i]);
}
return (float)retval;
}
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+ ++histo_argb[0][argb >> 24];
+ ++histo_argb[1][(argb >> 16) & 0xff];
+ ++histo_argb[2][(argb >> 8) & 0xff];
+ ++histo_argb[3][argb & 0xff];
+}
+
static int GetBestPredictorForTile(int width, int height,
int tile_x, int tile_y, int bits,
- int accumulated[4][256],
+ const int accumulated[4][256],
const uint32_t* const argb_scratch) {
const int kNumPredModes = 14;
const int col_start = tile_x << bits;
const int row_start = tile_y << bits;
const int tile_size = 1 << bits;
- const int ymax = (tile_size <= height - row_start) ?
- tile_size : height - row_start;
- const int xmax = (tile_size <= width - col_start) ?
- tile_size : width - col_start;
- int histo[4][256];
+ const int max_y = GetMin(tile_size, height - row_start);
+ const int max_x = GetMin(tile_size, width - col_start);
float best_diff = MAX_DIFF_COST;
int best_mode = 0;
-
int mode;
for (mode = 0; mode < kNumPredModes; ++mode) {
const uint32_t* current_row = argb_scratch;
- const PredictorFunc pred_func = kPredictors[mode];
+ const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
float cur_diff;
int y;
- memset(&histo[0][0], 0, sizeof(histo));
- for (y = 0; y < ymax; ++y) {
+ int histo_argb[4][256];
+ memset(histo_argb, 0, sizeof(histo_argb));
+ for (y = 0; y < max_y; ++y) {
int x;
const int row = row_start + y;
const uint32_t* const upper_row = current_row;
current_row = upper_row + width;
- for (x = 0; x < xmax; ++x) {
+ for (x = 0; x < max_x; ++x) {
const int col = col_start + x;
uint32_t predict;
- uint32_t predict_diff;
if (row == 0) {
predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left.
} else if (col == 0) {
@@ -598,14 +634,11 @@ static int GetBestPredictorForTile(int width, int height,
} else {
predict = pred_func(current_row[col - 1], upper_row + col);
}
- predict_diff = VP8LSubPixels(current_row[col], predict);
- ++histo[0][predict_diff >> 24];
- ++histo[1][((predict_diff >> 16) & 0xff)];
- ++histo[2][((predict_diff >> 8) & 0xff)];
- ++histo[3][(predict_diff & 0xff)];
+ UpdateHisto(histo_argb, VP8LSubPixels(current_row[col], predict));
}
}
- cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
+ cur_diff = PredictionCostSpatialHistogram(
+ accumulated, (const int (*)[256])histo_argb);
if (cur_diff < best_diff) {
best_diff = cur_diff;
best_mode = mode;
@@ -622,20 +655,18 @@ static void CopyTileWithPrediction(int width, int height,
const int col_start = tile_x << bits;
const int row_start = tile_y << bits;
const int tile_size = 1 << bits;
- const int ymax = (tile_size <= height - row_start) ?
- tile_size : height - row_start;
- const int xmax = (tile_size <= width - col_start) ?
- tile_size : width - col_start;
- const PredictorFunc pred_func = kPredictors[mode];
+ const int max_y = GetMin(tile_size, height - row_start);
+ const int max_x = GetMin(tile_size, width - col_start);
+ const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
const uint32_t* current_row = argb_scratch;
int y;
- for (y = 0; y < ymax; ++y) {
+ for (y = 0; y < max_y; ++y) {
int x;
const int row = row_start + y;
const uint32_t* const upper_row = current_row;
current_row = upper_row + width;
- for (x = 0; x < xmax; ++x) {
+ for (x = 0; x < max_x; ++x) {
const int col = col_start + x;
const int pix = row * width + col;
uint32_t predict;
@@ -681,7 +712,8 @@ void VP8LResidualImage(int width, int height, int bits,
if (all_x_max > width) {
all_x_max = width;
}
- pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
+ pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits,
+ (const int (*)[256])histo,
argb_scratch);
image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
@@ -695,11 +727,7 @@ void VP8LResidualImage(int width, int height, int bits,
}
ix = all_y * width + tile_x_offset;
for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- const uint32_t a = argb[ix];
- ++histo[0][a >> 24];
- ++histo[1][((a >> 16) & 0xff)];
- ++histo[2][((a >> 8) & 0xff)];
- ++histo[3][(a & 0xff)];
+ UpdateHisto(histo, argb[ix]);
}
}
}
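
Note how a tile's chosen mode round-trips: VP8LResidualImage() packs it into the green channel of the tile's ARGB code above, and PredictorInverseTransform() below unpacks it with the same shift. A tiny sketch:

    #include <assert.h>
    #include <stdint.h>
    int main(void) {
      const uint32_t pred = 11;                         // chosen mode, 0..13
      const uint32_t code = 0xff000000u | (pred << 8);  // encoder-side pack
      assert(((code >> 8) & 0xf) == pred);              // decoder-side unpack
      return 0;
    }
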
@@ -724,29 +752,36 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
{
int y = y_start;
- const int mask = (1 << transform->bits_) - 1;
+ const int tile_width = 1 << transform->bits_;
+ const int mask = tile_width - 1;
+ const int safe_width = width & ~mask;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
const uint32_t* pred_mode_base =
transform->data_ + (y >> transform->bits_) * tiles_per_row;
while (y < y_end) {
- int x;
const uint32_t pred2 = Predictor2(data[-1], data - width);
const uint32_t* pred_mode_src = pred_mode_base;
- PredictorFunc pred_func;
-
+ VP8LPredictorFunc pred_func;
+ int x = 1;
+ int t = 1;
// First pixel follows the T (mode=2) mode.
AddPixelsEq(data, pred2);
-
// .. the rest:
- pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
- for (x = 1; x < width; ++x) {
- uint32_t pred;
- if ((x & mask) == 0) { // start of tile. Read predictor function.
- pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
+ while (x < safe_width) {
+ pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+ for (; t < tile_width; ++t, ++x) {
+ const uint32_t pred = pred_func(data[x - 1], data + x - width);
+ AddPixelsEq(data + x, pred);
+ }
+ t = 0;
+ }
+ if (x < width) {
+ pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+ for (; x < width; ++x) {
+ const uint32_t pred = pred_func(data[x - 1], data + x - width);
+ AddPixelsEq(data + x, pred);
}
- pred = pred_func(data[x - 1], data + x - width);
- AddPixelsEq(data + x, pred);
}
data += width;
++y;
@@ -757,9 +792,9 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
}
}
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
- int i = 0;
- for (; i < num_pixs; ++i) {
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = argb_data[i];
const uint32_t green = (argb >> 8) & 0xff;
const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
@@ -770,26 +805,19 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
-static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
- while (data < data_end) {
- const uint32_t argb = *data;
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
const uint32_t green = ((argb >> 8) & 0xff);
uint32_t red_blue = (argb & 0x00ff00ffu);
red_blue += (green << 16) | green;
red_blue &= 0x00ff00ffu;
- *data++ = (argb & 0xff00ff00u) | red_blue;
+ data[i] = (argb & 0xff00ff00u) | red_blue;
}
}
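
The two directions are exact inverses modulo 256; a quick round-trip check (arbitrary pixel value, declarations from ./lossless.h):

    #include <assert.h>
    #include <stdint.h>
    #include "./lossless.h"
    int main(void) {
      uint32_t px = 0xff804020u;                  // A=ff R=80 G=40 B=20
      VP8LSubtractGreenFromBlueAndRed_C(&px, 1);  // px becomes 0xff4040e0
      VP8LAddGreenToBlueAndRed_C(&px, 1);         // restored exactly
      assert(px == 0xff804020u);
      return 0;
    }
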
-typedef struct {
- // Note: the members are uint8_t, so that any negative values are
- // automatically converted to "mod 256" values.
- uint8_t green_to_red_;
- uint8_t green_to_blue_;
- uint8_t red_to_blue_;
-} Multipliers;
-
-static WEBP_INLINE void MultipliersClear(Multipliers* m) {
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
m->green_to_red_ = 0;
m->green_to_blue_ = 0;
m->red_to_blue_ = 0;
@@ -801,40 +829,54 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
}
static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
- Multipliers* const m) {
+ VP8LMultipliers* const m) {
m->green_to_red_ = (color_code >> 0) & 0xff;
m->green_to_blue_ = (color_code >> 8) & 0xff;
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
-static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+ const VP8LMultipliers* const m) {
return 0xff000000u |
((uint32_t)(m->red_to_blue_) << 16) |
((uint32_t)(m->green_to_blue_) << 8) |
m->green_to_red_;
}
-static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
- uint32_t argb, int inverse) {
- const uint32_t green = argb >> 8;
- const uint32_t red = argb >> 16;
- uint32_t new_red = red;
- uint32_t new_blue = argb;
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint32_t new_red = red;
+ uint32_t new_blue = argb;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
- if (inverse) {
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint32_t new_red = red;
+ uint32_t new_blue = argb;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
new_blue &= 0xff;
- } else {
- new_red -= ColorTransformDelta(m->green_to_red_, green);
- new_red &= 0xff;
- new_blue -= ColorTransformDelta(m->green_to_blue_, green);
- new_blue -= ColorTransformDelta(m->red_to_blue_, red);
- new_blue &= 0xff;
+ data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
- return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
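
As with subtract-green, the forward and inverse cross-color transforms cancel exactly modulo 256, since the inverse recovers the original red before reusing it for blue. A round-trip sketch with arbitrary multipliers:

    #include <assert.h>
    #include <stdint.h>
    #include "./lossless.h"
    int main(void) {
      VP8LMultipliers m;
      uint32_t px = 0xff804020u;
      m.green_to_red_ = 0x12;
      m.green_to_blue_ = 0x34;
      m.red_to_blue_ = 0x56;
      VP8LTransformColor_C(&m, &px, 1);         // encoder side
      VP8LTransformColorInverse_C(&m, &px, 1);  // decoder side
      assert(px == 0xff804020u);
      return 0;
    }
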
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
@@ -856,225 +898,251 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
-static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
- int ix, int xsize) {
- const uint32_t v = argb[ix];
- if (ix >= xsize + 3) {
- if (v == argb[ix - xsize] &&
- argb[ix - 1] == argb[ix - xsize - 1] &&
- argb[ix - 2] == argb[ix - xsize - 2] &&
- argb[ix - 3] == argb[ix - xsize - 3]) {
- return 1;
- }
- return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
- } else if (ix >= 3) {
- return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
- }
- return 0;
-}
-
static float PredictionCostCrossColor(const int accumulated[256],
const int counts[256]) {
// Favor low entropy, locally and globally.
// Favor small absolute values for PredictionCostSpatial
static const double kExpValue = 2.4;
- return CombinedShannonEntropy(counts, accumulated, 256) +
+ return CombinedShannonEntropy(counts, accumulated) +
PredictionCostSpatial(counts, 3, kExpValue);
}
-static Multipliers GetBestColorTransformForTile(
- int tile_x, int tile_y, int bits,
- Multipliers prevX,
- Multipliers prevY,
- int step, int xsize, int ysize,
- int* accumulated_red_histo,
- int* accumulated_blue_histo,
- const uint32_t* const argb) {
- float best_diff = MAX_DIFF_COST;
+static float GetPredictionCostCrossColorRed(
+ int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+ int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+ const int accumulated_red_histo[256], const uint32_t* const argb) {
+ int all_y;
+ int histo[256] = { 0 };
float cur_diff;
- const int halfstep = step / 2;
- const int max_tile_size = 1 << bits;
- const int tile_y_offset = tile_y * max_tile_size;
- const int tile_x_offset = tile_x * max_tile_size;
- int green_to_red;
- int green_to_blue;
- int red_to_blue;
- int all_x_max = tile_x_offset + max_tile_size;
- int all_y_max = tile_y_offset + max_tile_size;
- Multipliers best_tx;
- MultipliersClear(&best_tx);
- if (all_x_max > xsize) {
- all_x_max = xsize;
+ for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+ int ix = all_y * xsize + tile_x_offset;
+ int all_x;
+ for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+ ++histo[TransformColorRed(green_to_red, argb[ix])]; // red.
+ }
}
- if (all_y_max > ysize) {
- all_y_max = ysize;
+ cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+ if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
}
-
- for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
- int histo[256] = { 0 };
- int all_y;
-
- for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
- int ix = all_y * xsize + tile_x_offset;
- int all_x;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- if (SkipRepeatedPixels(argb, ix, xsize)) {
- continue;
- }
- ++histo[TransformColorRed(green_to_red, argb[ix])]; // red.
- }
- }
- cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
- if ((uint8_t)green_to_red == prevX.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
+ if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_red == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+static void GetBestGreenToRed(
+ int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+ int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+ const int accumulated_red_histo[256], const uint32_t* const argb,
+ VP8LMultipliers* const best_tx) {
+ int min_green_to_red = -64;
+ int max_green_to_red = 64;
+ int green_to_red = 0;
+ int eval_min = 1;
+ int eval_max = 1;
+ float cur_diff_min = MAX_DIFF_COST;
+ float cur_diff_max = MAX_DIFF_COST;
+ // Do a binary search to find the optimal green_to_red color transform.
+ while (max_green_to_red - min_green_to_red > 2) {
+ if (eval_min) {
+ cur_diff_min = GetPredictionCostCrossColorRed(
+ tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
+ prev_x, prev_y, min_green_to_red, accumulated_red_histo, argb);
+ eval_min = 0;
}
- if ((uint8_t)green_to_red == prevY.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
+ if (eval_max) {
+ cur_diff_max = GetPredictionCostCrossColorRed(
+ tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
+ prev_x, prev_y, max_green_to_red, accumulated_red_histo, argb);
+ eval_max = 0;
}
- if (green_to_red == 0) {
- cur_diff -= 3;
+ if (cur_diff_min < cur_diff_max) {
+ green_to_red = min_green_to_red;
+ max_green_to_red = (max_green_to_red + min_green_to_red) / 2;
+ eval_max = 1;
+ } else {
+ green_to_red = max_green_to_red;
+ min_green_to_red = (max_green_to_red + min_green_to_red) / 2;
+ eval_min = 1;
}
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- best_tx.green_to_red_ = green_to_red;
+ }
+ best_tx->green_to_red_ = green_to_red;
+}
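
The loop above is a plain bracketing search; in generic form it looks like this (a sketch, assuming the cost function is unimodal over [lo, hi], which is what the heuristic relies on):

    static int MinimizeUnimodal(float (*cost)(int), int lo, int hi) {
      float c_lo = cost(lo);
      float c_hi = cost(hi);
      int best = (c_lo < c_hi) ? lo : hi;
      while (hi - lo > 2) {
        const int mid = (lo + hi) / 2;
        if (c_lo < c_hi) {     // low end is better: shrink from above
          best = lo;
          hi = mid;
          c_hi = cost(hi);
        } else {               // high end is better: shrink from below
          best = hi;
          lo = mid;
          c_lo = cost(lo);
        }
      }
      return best;
    }
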
+
+static float GetPredictionCostCrossColorBlue(
+ int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+ int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+ int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256],
+ const uint32_t* const argb) {
+ int all_y;
+ int histo[256] = { 0 };
+ float cur_diff;
+ for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
+ int all_x;
+ int ix = all_y * xsize + tile_x_offset;
+ for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+ ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
}
}
- best_diff = MAX_DIFF_COST;
- for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
- for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
- int all_y;
- int histo[256] = { 0 };
- for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
- int all_x;
- int ix = all_y * xsize + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- if (SkipRepeatedPixels(argb, ix, xsize)) {
- continue;
- }
- ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
- }
- }
- cur_diff =
- PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
- if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (green_to_blue == 0) {
- cur_diff -= 3;
- }
- if (red_to_blue == 0) {
- cur_diff -= 3;
- }
+ cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+ if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ if (red_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+static void GetBestGreenRedToBlue(
+ int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
+ int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_blue_histo[256], const uint32_t* const argb,
+ VP8LMultipliers* const best_tx) {
+ float best_diff = MAX_DIFF_COST;
+ float cur_diff;
+ const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
+ const int min_green_to_blue = -32;
+ const int max_green_to_blue = 32;
+ const int min_red_to_blue = -32;
+ const int max_red_to_blue = 32;
+ const int num_iters =
+ (1 + (max_green_to_blue - min_green_to_blue) / step) *
+ (1 + (max_red_to_blue - min_red_to_blue) / step);
+ // Number of tries to get optimal green_to_blue & red_to_blue color
+ // transforms after finding a local minimum.
+ const int max_tries_after_min = 4 + (num_iters >> 2);
+ int num_tries_after_min = 0;
+ int green_to_blue;
+ for (green_to_blue = min_green_to_blue;
+ green_to_blue <= max_green_to_blue &&
+ num_tries_after_min < max_tries_after_min;
+ green_to_blue += step) {
+ int red_to_blue;
+ for (red_to_blue = min_red_to_blue;
+ red_to_blue <= max_red_to_blue &&
+ num_tries_after_min < max_tries_after_min;
+ red_to_blue += step) {
+ cur_diff = GetPredictionCostCrossColorBlue(
+ tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, prev_x,
+ prev_y, green_to_blue, red_to_blue, accumulated_blue_histo, argb);
if (cur_diff < best_diff) {
best_diff = cur_diff;
- best_tx.green_to_blue_ = green_to_blue;
- best_tx.red_to_blue_ = red_to_blue;
+ best_tx->green_to_blue_ = green_to_blue;
+ best_tx->red_to_blue_ = red_to_blue;
+ num_tries_after_min = 0;
+ } else {
+ ++num_tries_after_min;
}
}
}
+}
+
+static VP8LMultipliers GetBestColorTransformForTile(
+ int tile_x, int tile_y, int bits,
+ VP8LMultipliers prev_x,
+ VP8LMultipliers prev_y,
+ int quality, int xsize, int ysize,
+ const int accumulated_red_histo[256],
+ const int accumulated_blue_histo[256],
+ const uint32_t* const argb) {
+ const int max_tile_size = 1 << bits;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+ VP8LMultipliers best_tx;
+ MultipliersClear(&best_tx);
+
+ GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
+ prev_x, prev_y, accumulated_red_histo, argb, &best_tx);
+ GetBestGreenRedToBlue(tile_x_offset, tile_y_offset, all_x_max, all_y_max,
+ xsize, prev_x, prev_y, quality, accumulated_blue_histo,
+ argb, &best_tx);
return best_tx;
}
static void CopyTileWithColorTransform(int xsize, int ysize,
- int tile_x, int tile_y, int bits,
- Multipliers color_transform,
- uint32_t* const argb) {
- int y;
- int xscan = 1 << bits;
- int yscan = 1 << bits;
- tile_x <<= bits;
- tile_y <<= bits;
- if (xscan > xsize - tile_x) {
- xscan = xsize - tile_x;
- }
- if (yscan > ysize - tile_y) {
- yscan = ysize - tile_y;
- }
- yscan += tile_y;
- for (y = tile_y; y < yscan; ++y) {
- int ix = y * xsize + tile_x;
- const int end_ix = ix + xscan;
- for (; ix < end_ix; ++ix) {
- argb[ix] = TransformColor(&color_transform, argb[ix], 0);
- }
+ int tile_x, int tile_y,
+ int max_tile_size,
+ VP8LMultipliers color_transform,
+ uint32_t* argb) {
+ const int xscan = GetMin(max_tile_size, xsize - tile_x);
+ int yscan = GetMin(max_tile_size, ysize - tile_y);
+ argb += tile_y * xsize + tile_x;
+ while (yscan-- > 0) {
+ VP8LTransformColor(&color_transform, argb, xscan);
+ argb += xsize;
}
}
-void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
uint32_t* const argb, uint32_t* image) {
const int max_tile_size = 1 << bits;
- int tile_xsize = VP8LSubSampleSize(width, bits);
- int tile_ysize = VP8LSubSampleSize(height, bits);
+ const int tile_xsize = VP8LSubSampleSize(width, bits);
+ const int tile_ysize = VP8LSubSampleSize(height, bits);
int accumulated_red_histo[256] = { 0 };
int accumulated_blue_histo[256] = { 0 };
- int tile_y;
- int tile_x;
- Multipliers prevX;
- Multipliers prevY;
- MultipliersClear(&prevY);
- MultipliersClear(&prevX);
+ int tile_x, tile_y;
+ VP8LMultipliers prev_x, prev_y;
+ MultipliersClear(&prev_y);
+ MultipliersClear(&prev_x);
for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
- Multipliers color_transform;
- int all_x_max;
int y;
- const int tile_y_offset = tile_y * max_tile_size;
const int tile_x_offset = tile_x * max_tile_size;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+ const int offset = tile_y * tile_xsize + tile_x;
if (tile_y != 0) {
- ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
- ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
- &prevY);
- } else if (tile_x != 0) {
- ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
+ ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
}
- color_transform =
- GetBestColorTransformForTile(tile_x, tile_y, bits,
- prevX, prevY,
- step, width, height,
- &accumulated_red_histo[0],
- &accumulated_blue_histo[0],
- argb);
- image[tile_y * tile_xsize + tile_x] =
- MultipliersToColorCode(&color_transform);
- CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
- color_transform, argb);
+ prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+ prev_x, prev_y,
+ quality, width, height,
+ accumulated_red_histo,
+ accumulated_blue_histo,
+ argb);
+ image[offset] = MultipliersToColorCode(&prev_x);
+ CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+ max_tile_size, prev_x, argb);
// Gather accumulated histogram data.
- all_x_max = tile_x_offset + max_tile_size;
- if (all_x_max > width) {
- all_x_max = width;
- }
- for (y = 0; y < max_tile_size; ++y) {
- int ix;
- int all_x;
- int all_y = tile_y_offset + y;
- if (all_y >= height) {
- break;
- }
- ix = all_y * width + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
+ for (y = tile_y_offset; y < all_y_max; ++y) {
+ int ix = y * width + tile_x_offset;
+ const int ix_end = ix + all_x_max - tile_x_offset;
+ for (; ix < ix_end; ++ix) {
+ const uint32_t pix = argb[ix];
if (ix >= 2 &&
- argb[ix] == argb[ix - 2] &&
- argb[ix] == argb[ix - 1]) {
+ pix == argb[ix - 2] &&
+ pix == argb[ix - 1]) {
continue; // repeated pixels are handled by backward references
}
if (ix >= width + 2 &&
argb[ix - 2] == argb[ix - width - 2] &&
argb[ix - 1] == argb[ix - width - 1] &&
- argb[ix] == argb[ix - width]) {
+ pix == argb[ix - width]) {
continue; // repeated pixels are handled by backward references
}
- ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
- ++accumulated_blue_histo[argb[ix] & 0xff];
+ ++accumulated_red_histo[(pix >> 16) & 0xff];
+ ++accumulated_blue_histo[(pix >> 0) & 0xff];
}
}
}
@@ -1085,7 +1153,10 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int step,
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end, uint32_t* data) {
const int width = transform->xsize_;
- const int mask = (1 << transform->bits_) - 1;
+ const int tile_width = 1 << transform->bits_;
+ const int mask = tile_width - 1;
+ const int safe_width = width & ~mask;
+ const int remaining_width = width - safe_width;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
int y = y_start;
const uint32_t* pred_row =
@@ -1093,16 +1164,21 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
while (y < y_end) {
const uint32_t* pred = pred_row;
- Multipliers m = { 0, 0, 0 };
- int x;
-
- for (x = 0; x < width; ++x) {
- if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
- data[x] = TransformColor(&m, data[x], 1);
+ VP8LMultipliers m = { 0, 0, 0 };
+ const uint32_t* const data_safe_end = data + safe_width;
+ const uint32_t* const data_end = data + width;
+ while (data < data_safe_end) {
+ ColorCodeToMultipliers(*pred++, &m);
+ VP8LTransformColorInverse(&m, data, tile_width);
+ data += tile_width;
+ }
+ if (data < data_end) { // Left-overs using C-version.
+ ColorCodeToMultipliers(*pred++, &m);
+ VP8LTransformColorInverse(&m, data, remaining_width);
+ data += remaining_width;
}
- data += width;
++y;
- if ((y & mask) == 0) pred_row += tiles_per_row;;
+ if ((y & mask) == 0) pred_row += tiles_per_row;
}
}
@@ -1173,7 +1249,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
assert(row_end <= transform->ysize_);
switch (transform->type_) {
case SUBTRACT_GREEN:
- VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
+ VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
break;
case PREDICTOR_TRANSFORM:
PredictorInverseTransform(transform, row_start, row_end, out);
@@ -1218,8 +1294,8 @@ static int is_big_endian(void) {
return (tmp.b[0] != 1);
}
-static void ConvertBGRAToRGB(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1229,8 +1305,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
}
}
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1241,8 +1317,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1258,8 +1334,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
}
}
-static void ConvertBGRAToRGB565(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1275,8 +1351,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
}
}
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToBGR_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
@@ -1291,29 +1367,18 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
if (is_big_endian() == swap_on_big_endian) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
- uint32_t argb = *src++;
+ const uint32_t argb = *src++;
-#if !defined(__BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN)
#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
-#if defined(__i386__) || defined(__x86_64__)
- __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
- *(uint32_t*)dst = argb;
-#elif defined(_MSC_VER)
- argb = _byteswap_ulong(argb);
- *(uint32_t*)dst = argb;
-#else
- dst[0] = (argb >> 24) & 0xff;
- dst[1] = (argb >> 16) & 0xff;
- dst[2] = (argb >> 8) & 0xff;
- dst[3] = (argb >> 0) & 0xff;
-#endif
+ *(uint32_t*)dst = BSwap32(argb);
#else // WEBP_REFERENCE_IMPLEMENTATION
dst[0] = (argb >> 24) & 0xff;
dst[1] = (argb >> 16) & 0xff;
dst[2] = (argb >> 8) & 0xff;
dst[3] = (argb >> 0) & 0xff;
#endif
-#else // __BIG_ENDIAN__
+#else // WORDS_BIGENDIAN
dst[0] = (argb >> 0) & 0xff;
dst[1] = (argb >> 8) & 0xff;
dst[2] = (argb >> 16) & 0xff;
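
BSwap32() now comes from ../utils/endian_inl.h; its semantics written out portably (a sketch; the real header is expected to prefer a compiler intrinsic where one is available):

    #include <assert.h>
    #include <stdint.h>
    static uint32_t BSwap32Portable(uint32_t x) {
      return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
             ((x << 8) & 0x00ff0000u) | (x << 24);
    }
    int main(void) {
      assert(BSwap32Portable(0x11223344u) == 0x44332211u);
      return 0;
    }
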
@@ -1330,17 +1395,17 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
switch (out_colorspace) {
case MODE_RGB:
- ConvertBGRAToRGB(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
break;
case MODE_RGBA:
- ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
break;
case MODE_rgbA:
- ConvertBGRAToRGBA(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
break;
case MODE_BGR:
- ConvertBGRAToBGR(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
break;
case MODE_BGRA:
CopyOrSwap(in_data, num_pixels, rgba, 1);
@@ -1357,20 +1422,21 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
break;
case MODE_RGBA_4444:
- ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
break;
case MODE_rgbA_4444:
- ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
break;
case MODE_RGB_565:
- ConvertBGRAToRGB565(in_data, num_pixels, rgba);
+ VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
break;
default:
assert(0); // Code flow should not reach here.
}
}
+//------------------------------------------------------------------------------
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
void VP8LBundleColorMap(const uint8_t* const row, int width,
int xbits, uint32_t* const dst) {
@@ -1394,129 +1460,166 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,
//------------------------------------------------------------------------------
-// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
-// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
-#if defined(WEBP_USE_SSE2)
-static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
- uint32_t c2) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
- const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
- const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
- const __m128i V1 = _mm_add_epi16(C0, C1);
- const __m128i V2 = _mm_sub_epi16(V1, C2);
- const __m128i b = _mm_packus_epi16(V2, V2);
- const uint32_t output = _mm_cvtsi128_si32(b);
- return output;
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
- uint32_t c2) {
- const uint32_t ave = Average2(c0, c1);
- const __m128i zero = _mm_setzero_si128();
- const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
- const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
- const __m128i A1 = _mm_sub_epi16(A0, B0);
- const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
- const __m128i A2 = _mm_sub_epi16(A1, BgtA);
- const __m128i A3 = _mm_srai_epi16(A2, 1);
- const __m128i A4 = _mm_add_epi16(A0, A3);
- const __m128i A5 = _mm_packus_epi16(A4, A4);
- const uint32_t output = _mm_cvtsi128_si32(A5);
- return output;
-}
-
-static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
- int pa_minus_pb;
- const __m128i zero = _mm_setzero_si128();
- const __m128i A0 = _mm_cvtsi32_si128(a);
- const __m128i B0 = _mm_cvtsi32_si128(b);
- const __m128i C0 = _mm_cvtsi32_si128(c);
- const __m128i AC0 = _mm_subs_epu8(A0, C0);
- const __m128i CA0 = _mm_subs_epu8(C0, A0);
- const __m128i BC0 = _mm_subs_epu8(B0, C0);
- const __m128i CB0 = _mm_subs_epu8(C0, B0);
- const __m128i AC = _mm_or_si128(AC0, CA0);
- const __m128i BC = _mm_or_si128(BC0, CB0);
- const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
- const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
- const __m128i diff = _mm_sub_epi16(pb, pa);
- {
- int16_t out[8];
- _mm_storeu_si128((__m128i*)out, diff);
- pa_minus_pb = out[0] + out[1] + out[2] + out[3];
- }
- return (pa_minus_pb <= 0) ? a : b;
+static double ExtraCost(const uint32_t* population, int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+ return cost;
}
-static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
- int i = 0;
- const __m128i mask = _mm_set1_epi32(0x0000ff00);
- for (; i + 4 < num_pixs; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
- const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
- const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
- const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
- const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
- const __m128i out = _mm_sub_epi8(in, in_0g0g);
- _mm_storeu_si128((__m128i*)&argb_data[i], out);
+static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
+ int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) {
+ const int xy = X[i + 2] + Y[i + 2];
+ cost += (i >> 1) * xy;
}
- // fallthrough and finish off with plain-C
- for (; i < num_pixs; ++i) {
- const uint32_t argb = argb_data[i];
- const uint32_t green = (argb >> 8) & 0xff;
- const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
- const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
- argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+ return cost;
+}
+
+// Returns the various RLE counts
+static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
+ int i;
+ int streak = 0;
+ VP8LStreaks stats;
+ memset(&stats, 0, sizeof(stats));
+ for (i = 0; i < length - 1; ++i) {
+ ++streak;
+ if (population[i] == population[i + 1]) {
+ continue;
+ }
+ stats.counts[population[i] != 0] += (streak > 3);
+ stats.streaks[population[i] != 0][(streak > 3)] += streak;
+ streak = 0;
}
+ ++streak;
+ stats.counts[population[i] != 0] += (streak > 3);
+ stats.streaks[population[i] != 0][(streak > 3)] += streak;
+ return stats;
}
-static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
- const __m128i mask = _mm_set1_epi32(0x0000ff00);
- for (; data + 4 < data_end; data += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)data);
- const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
- const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
- const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
- const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
- const __m128i out = _mm_add_epi8(in, in_0g0g);
- _mm_storeu_si128((__m128i*)data, out);
+static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
+ const uint32_t* Y, int length) {
+ int i;
+ int streak = 0;
+ VP8LStreaks stats;
+ memset(&stats, 0, sizeof(stats));
+ for (i = 0; i < length - 1; ++i) {
+ const int xy = X[i] + Y[i];
+ const int xy_next = X[i + 1] + Y[i + 1];
+ ++streak;
+ if (xy == xy_next) {
+ continue;
+ }
+ stats.counts[xy != 0] += (streak > 3);
+ stats.streaks[xy != 0][(streak > 3)] += streak;
+ streak = 0;
}
- // fallthrough and finish off with plain-C
- while (data < data_end) {
- const uint32_t argb = *data;
- const uint32_t green = ((argb >> 8) & 0xff);
- uint32_t red_blue = (argb & 0x00ff00ffu);
- red_blue += (green << 16) | green;
- red_blue &= 0x00ff00ffu;
- *data++ = (argb & 0xff00ff00u) | red_blue;
+ {
+ const int xy = X[i] + Y[i];
+ ++streak;
+ stats.counts[xy != 0] += (streak > 3);
+ stats.streaks[xy != 0][(streak > 3)] += streak;
}
+ return stats;
}
-extern void VP8LDspInitSSE2(void);
+//------------------------------------------------------------------------------
-void VP8LDspInitSSE2(void) {
- VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
- VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
- VP8LSelect = SelectSSE2;
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
- VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
+static void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ int i;
+ const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ if (b != out) {
+ for (i = 0; i < literal_size; ++i) {
+ out->literal_[i] = a->literal_[i] + b->literal_[i];
+ }
+ for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+ out->distance_[i] = a->distance_[i] + b->distance_[i];
+ }
+ for (i = 0; i < NUM_LITERAL_CODES; ++i) {
+ out->red_[i] = a->red_[i] + b->red_[i];
+ out->blue_[i] = a->blue_[i] + b->blue_[i];
+ out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
+ }
+ } else {
+ for (i = 0; i < literal_size; ++i) {
+ out->literal_[i] += a->literal_[i];
+ }
+ for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+ out->distance_[i] += a->distance_[i];
+ }
+ for (i = 0; i < NUM_LITERAL_CODES; ++i) {
+ out->red_[i] += a->red_[i];
+ out->blue_[i] += a->blue_[i];
+ out->alpha_[i] += a->alpha_[i];
+ }
+ }
}
-#endif
+
//------------------------------------------------------------------------------
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-VP8LPredSelectFunc VP8LSelect;
-VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LPredictorFunc VP8LPredictors[16];
+
+VP8LTransformColorFunc VP8LTransformColor;
+VP8LTransformColorFunc VP8LTransformColorInverse;
+
+VP8LConvertFunc VP8LConvertBGRAToRGB;
+VP8LConvertFunc VP8LConvertBGRAToRGBA;
+VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+VP8LConvertFunc VP8LConvertBGRAToRGB565;
+VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+VP8LCostFunc VP8LExtraCost;
+VP8LCostCombinedFunc VP8LExtraCostCombined;
+
+VP8LCostCountFunc VP8LHuffmanCostCount;
+VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+
+VP8LHistogramAddFunc VP8LHistogramAdd;
+
+extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitNEON(void);
+extern void VP8LDspInitMIPS32(void);
+
+static volatile VP8CPUInfo lossless_last_cpuinfo_used =
+ (VP8CPUInfo)&lossless_last_cpuinfo_used;
void VP8LDspInit(void) {
- VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
- VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
- VP8LSelect = Select;
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
- VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+ if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
+
+ VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
+ VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
+
+ VP8LTransformColor = VP8LTransformColor_C;
+ VP8LTransformColorInverse = VP8LTransformColorInverse_C;
+
+ VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+ VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+ VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
+ VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
+ VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+
+ VP8LFastLog2Slow = FastLog2Slow;
+ VP8LFastSLog2Slow = FastSLog2Slow;
+
+ VP8LExtraCost = ExtraCost;
+ VP8LExtraCostCombined = ExtraCostCombined;
+
+ VP8LHuffmanCostCount = HuffmanCostCount;
+ VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
+
+ VP8LHistogramAdd = HistogramAdd;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -1525,8 +1628,18 @@ void VP8LDspInit(void) {
VP8LDspInitSSE2();
}
#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8LDspInitNEON();
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8LDspInitMIPS32();
+ }
+#endif
}
+ lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
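
Typical usage of the dispatch table (a sketch): initialize once, then call through the pointers; the C fallbacks installed above get overwritten by the SSE2/NEON/MIPS32 variants when the CPU supports them:

    #include <stdint.h>
    #include "./lossless.h"
    int main(void) {
      uint32_t px = 0xff804020u;
      VP8LDspInit();                            // cheap if already initialized
      VP8LSubtractGreenFromBlueAndRed(&px, 1);  // best available implementation
      return 0;
    }
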
//------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index 0f1d442..8c7551c 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -18,26 +18,58 @@
#include "../webp/types.h"
#include "../webp/decode.h"
+#include "../enc/histogram.h"
+#include "../utils/utils.h"
+
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
-//
+// Signatures and generic function pointers
+
+typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+extern VP8LPredictorFunc VP8LPredictors[16];
-typedef uint32_t (*VP8LPredClampedAddSubFunc)(uint32_t c0, uint32_t c1,
- uint32_t c2);
-typedef uint32_t (*VP8LPredSelectFunc)(uint32_t c0, uint32_t c1, uint32_t c2);
-typedef void (*VP8LSubtractGreenFromBlueAndRedFunc)(uint32_t* argb_data,
- int num_pixs);
-typedef void (*VP8LAddGreenToBlueAndRedFunc)(uint32_t* data_start,
- const uint32_t* data_end);
+typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
+extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
-extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-extern VP8LPredSelectFunc VP8LSelect;
-extern VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-extern VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+typedef struct {
+ // Note: the members are uint8_t, so that any negative values are
+ // automatically converted to "mod 256" values.
+ uint8_t green_to_red_;
+ uint8_t green_to_blue_;
+ uint8_t red_to_blue_;
+} VP8LMultipliers;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels);
+extern VP8LTransformColorFunc VP8LTransformColor;
+extern VP8LTransformColorFunc VP8LTransformColorInverse;
+
+typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
+ uint8_t* dst);
+extern VP8LConvertFunc VP8LConvertBGRAToRGB;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
+extern VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+// Expose some C-only fallback functions
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels);
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+ int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
// Must be called before calling any of the above methods.
void VP8LDspInit(void);
@@ -66,7 +98,7 @@ void VP8LResidualImage(int width, int height, int bits,
uint32_t* const argb, uint32_t* const argb_scratch,
uint32_t* const image);
-void VP8LColorSpaceTransform(int width, int height, int bits, int step,
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
uint32_t* const argb, uint32_t* image);
//------------------------------------------------------------------------------
@@ -85,57 +117,54 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
return (size + (1 << sampling_bits) - 1) >> sampling_bits;
}
+// -----------------------------------------------------------------------------
// Faster logarithm for integers. Small values use a look-up table.
#define LOG_LOOKUP_IDX_MAX 256
extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-float VP8LFastLog2Slow(int v);
-float VP8LFastSLog2Slow(int v);
-static WEBP_INLINE float VP8LFastLog2(int v) {
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
}
// Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(int v) {
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
}
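
Small arguments are a single table load; anything at or above LOG_LOOKUP_IDX_MAX dispatches through the (now platform-selectable) slow path, so VP8LDspInit() must have run first. A sketch:

    #include "./lossless.h"
    static float Example(void) {
      VP8LDspInit();               // installs VP8LFastLog2Slow
      return VP8LFastLog2(16)      // table hit: exactly 4.0f
           + VP8LFastLog2(1000);   // >= 256: goes through VP8LFastLog2Slow
    }
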
// -----------------------------------------------------------------------------
-// PrefixEncode()
+// Huffman-cost related functions.
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
- ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
- return 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
- (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
- unsigned long first_set_bit;
- _BitScanReverse(&first_set_bit, n);
- return first_set_bit;
-}
-#else
-// Returns (int)floor(log2(n)). n must be > 0.
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
- int log = 0;
- uint32_t value = n;
- int i;
-
- for (i = 4; i >= 0; --i) {
- const int shift = (1 << i);
- const uint32_t x = value >> shift;
- if (x != 0) {
- value = x;
- log += shift;
- }
- }
- return log;
-}
-#endif
+typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+ int length);
+
+extern VP8LCostFunc VP8LExtraCost;
+extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+
+typedef struct { // small struct to hold counters
+ int counts[2]; // index: 0=zero streak, 1=non-zero streak
+ int streaks[2][2]; // [zero/non-zero][streak<=3 / streak>3]
+} VP8LStreaks;
+
+typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
+ int length);
+typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
+ const uint32_t* Y, int length);
+
+extern VP8LCostCountFunc VP8LHuffmanCostCount;
+extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+
+typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out);
+extern VP8LHistogramAddFunc VP8LHistogramAdd;
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
const int log_floor = BitsLog2Floor(n);
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_mips32.c
new file mode 100644
index 0000000..1308580
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips32.c
@@ -0,0 +1,416 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of lossless functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+#include "./lossless.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define APPROX_LOG_WITH_CORRECTION_MAX 65536
+#define APPROX_LOG_MAX 4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+
+static float FastSLog2Slow(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ uint32_t log_cnt, y, correction;
+ const int c24 = 24;
+ const float v_f = (float)v;
+ uint32_t temp;
+
+ // Xf < 256 = 2^8
+ // log_cnt is index of leading one in upper 24 bits
+ __asm__ volatile(
+ "clz %[log_cnt], %[v] \n\t"
+ "addiu %[y], $zero, 1 \n\t"
+ "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
+ "sllv %[y], %[y], %[log_cnt] \n\t"
+ "srlv %[temp], %[v], %[log_cnt] \n\t"
+ : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+ [temp]"=r"(temp)
+ : [c24]"r"(c24), [v]"r"(v)
+ );
+
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y) / v
+ // LOG_2_RECIPROCAL ~ 23/16
+
+ // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
+ correction = (23 * (v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[temp] + log_cnt) + correction;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+ }
+}
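
The inline assembly computes the same three quantities as the portable do/while loop in lossless.c, using the count-leading-zeros instruction. A plain-C equivalent (a sketch, assuming GCC's __builtin_clz and the caller's guarantee that v is in [256, 65536)):

    #include <stdint.h>
    static void ClzSetup(uint32_t v, uint32_t* log_cnt, uint32_t* y,
                         uint32_t* temp) {
      *log_cnt = 24 - __builtin_clz(v);  // ensures (v >> *log_cnt) < 256
      *y = 1u << *log_cnt;               // y = 2^log_cnt
      *temp = v >> *log_cnt;             // index into kLog2Table
    }
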
+
+static float FastLog2Slow(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ uint32_t log_cnt, y;
+ const int c24 = 24;
+ double log_2;
+ uint32_t temp;
+
+ __asm__ volatile(
+ "clz %[log_cnt], %[v] \n\t"
+ "addiu %[y], $zero, 1 \n\t"
+ "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
+ "sllv %[y], %[y], %[log_cnt] \n\t"
+ "srlv %[temp], %[v], %[log_cnt] \n\t"
+ : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+ [temp]"=r"(temp)
+ : [c24]"r"(c24), [v]"r"(v)
+ );
+
+ log_2 = kLog2Table[temp] + log_cnt;
+ if (v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+
+ const uint32_t correction = (23 * (v & (y - 1))) >> 4;
+ log_2 += (double)correction / v;
+ }
+ return (float)log_2;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * log((double)v));
+ }
+}
+
+// C version of this function:
+// int i = 0;
+// int64_t cost = 0;
+// const uint32_t* pop = &population[4];
+// const uint32_t* LoopEnd = &population[length];
+// while (pop != LoopEnd) {
+// ++i;
+// cost += i * *pop;
+// cost += i * *(pop + 1);
+// pop += 2;
+// }
+// return (double)cost;
+static double ExtraCost(const uint32_t* const population, int length) {
+ int i, temp0, temp1;
+ const uint32_t* pop = &population[4];
+ const uint32_t* const LoopEnd = &population[length];
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+ "xor %[i], %[i], %[i] \n\t"
+ "beq %[pop], %[LoopEnd], 2f \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[pop]) \n\t"
+ "lw %[temp1], 4(%[pop]) \n\t"
+ "addiu %[i], %[i], 1 \n\t"
+ "addiu %[pop], %[pop], 8 \n\t"
+ "madd %[i], %[temp0] \n\t"
+ "madd %[i], %[temp1] \n\t"
+ "bne %[pop], %[LoopEnd], 1b \n\t"
+ "2: \n\t"
+ "mfhi %[temp0] \n\t"
+ "mflo %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [i]"=&r"(i), [pop]"+r"(pop)
+ : [LoopEnd]"r"(LoopEnd)
+ : "memory", "hi", "lo"
+ );
+
+ return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+// C version of this function:
+// int i = 0;
+// int64_t cost = 0;
+// const uint32_t* pX = &X[4];
+// const uint32_t* pY = &Y[4];
+// const uint32_t* LoopEnd = &X[length];
+// while (pX != LoopEnd) {
+// const uint32_t xy0 = *pX + *pY;
+// const uint32_t xy1 = *(pX + 1) + *(pY + 1);
+// ++i;
+// cost += i * xy0;
+// cost += i * xy1;
+// pX += 2;
+// pY += 2;
+// }
+// return (double)cost;
+static double ExtraCostCombined(const uint32_t* const X,
+ const uint32_t* const Y, int length) {
+ int i, temp0, temp1, temp2, temp3;
+ const uint32_t* pX = &X[4];
+ const uint32_t* pY = &Y[4];
+ const uint32_t* const LoopEnd = &X[length];
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+ "xor %[i], %[i], %[i] \n\t"
+ "beq %[pX], %[LoopEnd], 2f \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[pX]) \n\t"
+ "lw %[temp1], 0(%[pY]) \n\t"
+ "lw %[temp2], 4(%[pX]) \n\t"
+ "lw %[temp3], 4(%[pY]) \n\t"
+ "addiu %[i], %[i], 1 \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addiu %[pX], %[pX], 8 \n\t"
+ "addiu %[pY], %[pY], 8 \n\t"
+ "madd %[i], %[temp0] \n\t"
+ "madd %[i], %[temp2] \n\t"
+ "bne %[pX], %[LoopEnd], 1b \n\t"
+ "2: \n\t"
+ "mfhi %[temp0] \n\t"
+ "mflo %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
+ : [LoopEnd]"r"(LoopEnd)
+ : "memory", "hi", "lo"
+ );
+
+ return (double)((int64_t)temp0 << 32 | temp1);
+}
+
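+// C equivalent of one pass (for reference; 'temp0' holds (value != 0)):
+//   if (streak > 3) {
+//     ++pcnts[temp0];                     // stats.counts[temp0]
+//     pstreaks[2 * temp0 + 1] += streak;  // stats.streaks[temp0][1]
+//   } else {
+//     pstreaks[2 * temp0 + 0] += streak;  // stats.streaks[temp0][0]
+//   }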
+#define HUFFMAN_COST_PASS \
+ __asm__ volatile( \
+ "sll %[temp1], %[temp0], 3 \n\t" \
+ "addiu %[temp3], %[streak], -3 \n\t" \
+ "addu %[temp2], %[pstreaks], %[temp1] \n\t" \
+ "blez %[temp3], 1f \n\t" \
+ "srl %[temp1], %[temp1], 1 \n\t" \
+ "addu %[temp3], %[pcnts], %[temp1] \n\t" \
+ "lw %[temp0], 4(%[temp2]) \n\t" \
+ "lw %[temp1], 0(%[temp3]) \n\t" \
+ "addu %[temp0], %[temp0], %[streak] \n\t" \
+ "addiu %[temp1], %[temp1], 1 \n\t" \
+ "sw %[temp0], 4(%[temp2]) \n\t" \
+ "sw %[temp1], 0(%[temp3]) \n\t" \
+ "b 2f \n\t" \
+ "1: \n\t" \
+ "lw %[temp0], 0(%[temp2]) \n\t" \
+ "addu %[temp0], %[temp0], %[streak] \n\t" \
+ "sw %[temp0], 0(%[temp2]) \n\t" \
+ "2: \n\t" \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
+ : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
+ [streak]"r"(streak) \
+ : "memory" \
+ );
+
+// Returns the various RLE counts
+static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
+ int i;
+ int streak = 0;
+ VP8LStreaks stats;
+ int* const pstreaks = &stats.streaks[0][0];
+ int* const pcnts = &stats.counts[0];
+ int temp0, temp1, temp2, temp3;
+ memset(&stats, 0, sizeof(stats));
+ for (i = 0; i < length - 1; ++i) {
+ ++streak;
+ if (population[i] == population[i + 1]) {
+ continue;
+ }
+ temp0 = (population[i] != 0);
+ HUFFMAN_COST_PASS
+ streak = 0;
+ }
+ ++streak;
+ temp0 = (population[i] != 0);
+ HUFFMAN_COST_PASS
+
+ return stats;
+}
+
+static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
+ const uint32_t* Y, int length) {
+ int i;
+ int streak = 0;
+ VP8LStreaks stats;
+ int* const pstreaks = &stats.streaks[0][0];
+ int* const pcnts = &stats.counts[0];
+ int temp0, temp1, temp2, temp3;
+ memset(&stats, 0, sizeof(stats));
+ for (i = 0; i < length - 1; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ const uint32_t xy_next = X[i + 1] + Y[i + 1];
+ ++streak;
+ if (xy == xy_next) {
+ continue;
+ }
+ temp0 = (xy != 0);
+ HUFFMAN_COST_PASS
+ streak = 0;
+ }
+ {
+ const uint32_t xy = X[i] + Y[i];
+ ++streak;
+ temp0 = (xy != 0);
+ HUFFMAN_COST_PASS
+ }
+
+ return stats;
+}
+
+#define ASM_START \
+ __asm__ volatile( \
+ ".set push \n\t" \
+ ".set at \n\t" \
+ ".set macro \n\t" \
+ "1: \n\t"
+
+// P2 = P0 + P1
+// A..D - byte offsets
+// E - flag telling the macro whether the
+//     P1 pointer should be incremented
+// literal_ and the histogram arrays that follow it may be unaligned,
+// so the unaligned load/store instructions ulw and usw must be used
+#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
+ "ulw %[temp0], "#A"(%["#P0"]) \n\t" \
+ "ulw %[temp1], "#B"(%["#P0"]) \n\t" \
+ "ulw %[temp2], "#C"(%["#P0"]) \n\t" \
+ "ulw %[temp3], "#D"(%["#P0"]) \n\t" \
+ "ulw %[temp4], "#A"(%["#P1"]) \n\t" \
+ "ulw %[temp5], "#B"(%["#P1"]) \n\t" \
+ "ulw %[temp6], "#C"(%["#P1"]) \n\t" \
+ "ulw %[temp7], "#D"(%["#P1"]) \n\t" \
+ "addu %[temp4], %[temp4], %[temp0] \n\t" \
+ "addu %[temp5], %[temp5], %[temp1] \n\t" \
+ "addu %[temp6], %[temp6], %[temp2] \n\t" \
+ "addu %[temp7], %[temp7], %[temp3] \n\t" \
+ "addiu %["#P0"], %["#P0"], 16 \n\t" \
+ ".if "#E" == 1 \n\t" \
+ "addiu %["#P1"], %["#P1"], 16 \n\t" \
+ ".endif \n\t" \
+ "usw %[temp4], "#A"(%["#P2"]) \n\t" \
+ "usw %[temp5], "#B"(%["#P2"]) \n\t" \
+ "usw %[temp6], "#C"(%["#P2"]) \n\t" \
+ "usw %[temp7], "#D"(%["#P2"]) \n\t" \
+ "addiu %["#P2"], %["#P2"], 16 \n\t" \
+ "bne %["#P0"], %[LoopEnd], 1b \n\t" \
+ ".set pop \n\t" \
+
+#define ASM_END_COMMON_0 \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
+ [pa]"+r"(pa), [pout]"+r"(pout)
+
+#define ASM_END_COMMON_1 \
+ : [LoopEnd]"r"(LoopEnd) \
+ : "memory", "at" \
+ );
+
+#define ASM_END_0 \
+ ASM_END_COMMON_0 \
+ , [pb]"+r"(pb) \
+ ASM_END_COMMON_1
+
+#define ASM_END_1 \
+ ASM_END_COMMON_0 \
+ ASM_END_COMMON_1
+
+#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \
+ const uint32_t* pa = (const uint32_t*)(A); \
+ const uint32_t* pb = (const uint32_t*)(B); \
+ uint32_t* pout = (uint32_t*)(OUT); \
+ const uint32_t* const LoopEnd = pa + (SIZE); \
+ assert((SIZE) % 4 == 0); \
+ ASM_START \
+ ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \
+ ASM_END_0 \
+ if ((EXTRA_SIZE) > 0) { \
+ const int last = (EXTRA_SIZE); \
+ int i; \
+ for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
+ } \
+} while (0)
+
+#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \
+ const uint32_t* pa = (const uint32_t*)(A); \
+ uint32_t* pout = (uint32_t*)(OUT); \
+ const uint32_t* const LoopEnd = pa + (SIZE); \
+ assert((SIZE) % 4 == 0); \
+ ASM_START \
+ ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \
+ ASM_END_1 \
+ if ((EXTRA_SIZE) > 0) { \
+ const int last = (EXTRA_SIZE); \
+ int i; \
+ for (i = 0; i < last; ++i) pout[i] += pa[i]; \
+ } \
+} while (0)
+
+static void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
+ - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+
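+ // When 'out' is distinct from 'b', compute out = a + b; otherwise
+ // accumulate 'a' into 'out' in place with the _EQ variants.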
+ if (b != out) {
+ ADD_VECTOR(a->literal_, b->literal_, out->literal_,
+ NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
+ ADD_VECTOR(a->distance_, b->distance_, out->distance_,
+ NUM_DISTANCE_CODES, 0);
+ ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
+ ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
+ ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
+ } else {
+ ADD_VECTOR_EQ(a->literal_, out->literal_,
+ NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
+ ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
+ ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
+ ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
+ ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
+ }
+}
+
+#undef ADD_VECTOR_EQ
+#undef ADD_VECTOR
+#undef ASM_END_1
+#undef ASM_END_0
+#undef ASM_END_COMMON_1
+#undef ASM_END_COMMON_0
+#undef ADD_TO_OUT
+#undef ASM_START
+
+#endif // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPS32(void);
+
+void VP8LDspInitMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+ VP8LFastSLog2Slow = FastSLog2Slow;
+ VP8LFastLog2Slow = FastLog2Slow;
+ VP8LExtraCost = ExtraCost;
+ VP8LExtraCostCombined = ExtraCostCombined;
+ VP8LHuffmanCostCount = HuffmanCostCount;
+ VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
+ VP8LHistogramAdd = HistogramAdd;
+#endif // WEBP_USE_MIPS32
+}
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
new file mode 100644
index 0000000..8c82b19
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -0,0 +1,357 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#if !defined(WORK_AROUND_GCC)
+// gcc 4.6.0 (NDK-r9) had some trouble with this code, so we only use it
+// with gcc 4.8.x and later.
+static void ConvertBGRAToRGBA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~15);
+ for (; src < end; src += 16) {
+ uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+ // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
+ const uint8x16_t tmp = pixel.val[0];
+ pixel.val[0] = pixel.val[2];
+ pixel.val[2] = tmp;
+ vst4q_u8(dst, pixel);
+ dst += 64;
+ }
+ VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~15);
+ for (; src < end; src += 16) {
+ const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+ const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
+ vst3q_u8(dst, tmp);
+ dst += 48;
+ }
+ VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
+}
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~15);
+ for (; src < end; src += 16) {
+ const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
+ const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
+ vst3q_u8(dst, tmp);
+ dst += 48;
+ }
+ VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs
+}
+
+#else // WORK_AROUND_GCC
+
+// gcc-4.6.0 fallback
+
+static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~1);
+ const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
+ for (; src < end; src += 2) {
+ const uint8x8_t pixels = vld1_u8((uint8_t*)src);
+ vst1_u8(dst, vtbl1_u8(pixels, shuffle));
+ dst += 8;
+ }
+ VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs
+}
+
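+// vtbl indices selecting the B, G and R bytes from 32 input bytes,
+// skipping every fourth (alpha) byte.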
+static const uint8_t kBGRShuffle[3][8] = {
+ { 0, 1, 2, 4, 5, 6, 8, 9 },
+ { 10, 12, 13, 14, 16, 17, 18, 20 },
+ { 21, 22, 24, 25, 26, 28, 29, 30 }
+};
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~7);
+ const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
+ const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
+ const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
+ for (; src < end; src += 8) {
+ uint8x8x4_t pixels;
+ INIT_VECTOR4(pixels,
+ vld1_u8((const uint8_t*)(src + 0)),
+ vld1_u8((const uint8_t*)(src + 2)),
+ vld1_u8((const uint8_t*)(src + 4)),
+ vld1_u8((const uint8_t*)(src + 6)));
+ vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
+ vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
+ vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
+ dst += 8 * 3;
+ }
+ VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs
+}
+
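+// Same selection as kBGRShuffle, but with R and B swapped within each pixel.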
+static const uint8_t kRGBShuffle[3][8] = {
+ { 2, 1, 0, 6, 5, 4, 10, 9 },
+ { 8, 14, 13, 12, 18, 17, 16, 22 },
+ { 21, 20, 26, 25, 24, 30, 29, 28 }
+};
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~7);
+ const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
+ const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
+ const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
+ for (; src < end; src += 8) {
+ uint8x8x4_t pixels;
+ INIT_VECTOR4(pixels,
+ vld1_u8((const uint8_t*)(src + 0)),
+ vld1_u8((const uint8_t*)(src + 2)),
+ vld1_u8((const uint8_t*)(src + 4)),
+ vld1_u8((const uint8_t*)(src + 6)));
+ vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
+ vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
+ vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
+ dst += 8 * 3;
+ }
+ VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs
+}
+
+#endif // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+
+#ifdef USE_INTRINSICS
+
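+// vhadd_u8 computes the per-byte halving add (a + b) >> 1 without overflow,
+// averaging all four ARGB channels at once.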
+static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
+ const uint32_t* const b) {
+ const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
+ const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
+ const uint8x8_t avg = vhadd_u8(a0, b0);
+ return vget_lane_u32(vreinterpret_u32_u8(avg), 0);
+}
+
+static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
+ const uint32_t* const b,
+ const uint32_t* const c) {
+ const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
+ const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
+ const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
+ const uint8x8_t avg1 = vhadd_u8(a0, c0);
+ const uint8x8_t avg2 = vhadd_u8(avg1, b0);
+ return vget_lane_u32(vreinterpret_u32_u8(avg2), 0);
+}
+
+static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
+ const uint32_t* const b,
+ const uint32_t* const c,
+ const uint32_t* const d) {
+ const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
+ const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
+ const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
+ const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d));
+ const uint8x8_t avg1 = vhadd_u8(a0, b0);
+ const uint8x8_t avg2 = vhadd_u8(c0, d0);
+ const uint8x8_t avg3 = vhadd_u8(avg1, avg2);
+ return vget_lane_u32(vreinterpret_u32_u8(avg3), 0);
+}
+
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+ return Average3(&left, top + 0, top + 1);
+}
+
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+ return Average2(&left, top - 1);
+}
+
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+ return Average2(&left, top + 0);
+}
+
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return Average2(top - 1, top + 0);
+}
+
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return Average2(top + 0, top + 1);
+}
+
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+ return Average4(&left, top - 1, top + 0, top + 1);
+}
+
+//------------------------------------------------------------------------------
+
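+// Note: despite its name, 'pa_minus_pb' below accumulates
+// sum(|b - c|) - sum(|a - c|) over the four channels, as in the generic
+// C implementation.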
+static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
+ const uint32_t* const c1,
+ const uint32_t* const c2) {
+ const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
+ const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
+ const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
+ const uint8x8_t bc = vabd_u8(p1, p2); // |b-c|
+ const uint8x8_t ac = vabd_u8(p0, p2); // |a-c|
+ const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
+ const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
+ const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
+ const int32_t pa_minus_pb = vget_lane_s32(diff, 0);
+ return (pa_minus_pb <= 0) ? *c0 : *c1;
+}
+
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+ return Select(top + 0, &left, top - 1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
+ const uint32_t* const c1,
+ const uint32_t* const c2) {
+ const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
+ const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
+ const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
+ const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen
+ const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract
+ const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp
+ return vget_lane_u32(vreinterpret_u32_u8(out), 0);
+}
+
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+ return ClampedAddSubtractFull(&left, top + 0, top - 1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
+ const uint32_t* const c1,
+ const uint32_t* const c2) {
+ const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
+ const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
+ const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
+ const uint8x8_t avg = vhadd_u8(p0, p1); // Average(c0,c1)
+ const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1); // (a-b)>>1 saturated
+ const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1); // (b-a)>>1 saturated
+ const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
+ return vget_lane_u32(vreinterpret_u32_u8(out), 0);
+}
+
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+ return ClampedAddSubtractHalf(&left, top + 0, top - 1);
+}
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// The vtbl?_u8 intrinsics are marked unavailable for iOS arm64 with
+// Xcode < 6.3, so the non-standard versions are used there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+ defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+ 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
+ return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+ vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
+ return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+ vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif // USE_VTBLQ
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+ const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+ for (; argb_data < end; argb_data += 4) {
+ const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+ const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+ vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+ const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+ for (; argb_data < end; argb_data += 4) {
+ const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+ const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+ vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
+ }
+ // fallthrough and finish off with plain-C
+ VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+#undef USE_VTBLQ
+
+#endif // USE_INTRINSICS
+
+#endif // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+
+extern void VP8LDspInitNEON(void);
+
+void VP8LDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+
+#ifdef USE_INTRINSICS
+ VP8LPredictors[5] = Predictor5;
+ VP8LPredictors[6] = Predictor6;
+ VP8LPredictors[7] = Predictor7;
+ VP8LPredictors[8] = Predictor8;
+ VP8LPredictors[9] = Predictor9;
+ VP8LPredictors[10] = Predictor10;
+ VP8LPredictors[11] = Predictor11;
+ VP8LPredictors[12] = Predictor12;
+ VP8LPredictors[13] = Predictor13;
+
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+#endif
+
+#endif // WEBP_USE_NEON
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
new file mode 100644
index 0000000..7130909
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -0,0 +1,535 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#include <assert.h>
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Predictor Transform
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
+ const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
+ const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+ const __m128i V1 = _mm_add_epi16(C0, C1);
+ const __m128i V2 = _mm_sub_epi16(V1, C2);
+ const __m128i b = _mm_packus_epi16(V2, V2);
+ const uint32_t output = _mm_cvtsi128_si32(b);
+ return output;
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
+ const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
+ const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+ const __m128i avg = _mm_add_epi16(C1, C0);
+ const __m128i A0 = _mm_srli_epi16(avg, 1);
+ const __m128i A1 = _mm_sub_epi16(A0, B0);
+ const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
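+ // BgtA is 0xffff (i.e. -1) where b0 > a0; subtracting it adds 1 to the
+ // negative differences so that the arithmetic shift below rounds toward
+ // zero, as in the C version.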
+ const __m128i A2 = _mm_sub_epi16(A1, BgtA);
+ const __m128i A3 = _mm_srai_epi16(A2, 1);
+ const __m128i A4 = _mm_add_epi16(A0, A3);
+ const __m128i A5 = _mm_packus_epi16(A4, A4);
+ const uint32_t output = _mm_cvtsi128_si32(A5);
+ return output;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+ int pa_minus_pb;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i A0 = _mm_cvtsi32_si128(a);
+ const __m128i B0 = _mm_cvtsi32_si128(b);
+ const __m128i C0 = _mm_cvtsi32_si128(c);
+ const __m128i AC0 = _mm_subs_epu8(A0, C0);
+ const __m128i CA0 = _mm_subs_epu8(C0, A0);
+ const __m128i BC0 = _mm_subs_epu8(B0, C0);
+ const __m128i CB0 = _mm_subs_epu8(C0, B0);
+ const __m128i AC = _mm_or_si128(AC0, CA0);
+ const __m128i BC = _mm_or_si128(BC0, CB0);
+ const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
+ const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
+ const __m128i diff = _mm_sub_epi16(pb, pa);
+ {
+ int16_t out[8];
+ _mm_storeu_si128((__m128i*)out, diff);
+ pa_minus_pb = out[0] + out[1] + out[2] + out[3];
+ }
+ return (pa_minus_pb <= 0) ? a : b;
+}
+
+static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
+ const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+ const __m128i sum = _mm_add_epi16(A1, A0);
+ const __m128i avg = _mm_srli_epi16(sum, 1);
+ return avg;
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+ const __m128i avg = Average2_128i(a0, a1);
+ const __m128i A2 = _mm_packus_epi16(avg, avg);
+ const uint32_t output = _mm_cvtsi128_si32(A2);
+ return output;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i avg1 = Average2_128i(a0, a2);
+ const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+ const __m128i sum = _mm_add_epi16(avg1, A1);
+ const __m128i avg2 = _mm_srli_epi16(sum, 1);
+ const __m128i A2 = _mm_packus_epi16(avg2, avg2);
+ const uint32_t output = _mm_cvtsi128_si32(A2);
+ return output;
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+ uint32_t a2, uint32_t a3) {
+ const __m128i avg1 = Average2_128i(a0, a1);
+ const __m128i avg2 = Average2_128i(a2, a3);
+ const __m128i sum = _mm_add_epi16(avg2, avg1);
+ const __m128i avg3 = _mm_srli_epi16(sum, 1);
+ const __m128i A0 = _mm_packus_epi16(avg3, avg3);
+ const uint32_t output = _mm_cvtsi128_si32(A0);
+ return output;
+}
+
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average3(left, top[0], top[1]);
+ return pred;
+}
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(left, top[-1]);
+ return pred;
+}
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(left, top[0]);
+ return pred;
+}
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(top[-1], top[0]);
+ (void)left;
+ return pred;
+}
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(top[0], top[1]);
+ (void)left;
+ return pred;
+}
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+ return pred;
+}
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Select(top[0], left, top[-1]);
+ return pred;
+}
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+ return pred;
+}
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+ return pred;
+}
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ const __m128i mask = _mm_set1_epi32(0x0000ff00);
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
+ const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
+ const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
+ const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+ const __m128i out = _mm_sub_epi8(in, in_0g0g);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ const __m128i mask = _mm_set1_epi32(0x0000ff00);
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
+ const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
+ const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
+ const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+ const __m128i out = _mm_add_epi8(in, in_0g0g);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
+ __m128i color) {
+ // We simulate signed 8-bit multiplication as:
+ // * Left shift the two (8-bit) numbers by 8 bits,
+ // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
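+ // E.g. with color_pred = 26 and color = 100 in a lane:
+ // (26 << 8) * (100 << 8) >> 16 = 2600, and 2600 >> 5 = 81.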
+ const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
+ const __m128i color_shifted = _mm_slli_epi32(color, 8);
+ // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
+ // happen to be zeroes.
+ const __m128i signed_mult =
+ _mm_mulhi_epi16(color_pred_shifted, color_shifted);
+ return _mm_srli_epi32(signed_mult, 5);
+}
+
+static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
+ uint32_t* argb_data,
+ int num_pixels) {
+ const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers
+ const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+ const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+ int i;
+
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
+ const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+ const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+ const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
+ const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green
+ const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+ const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+ const __m128i b = in;
+
+ const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red
+ const __m128i r_new =
+ _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
+ const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
+
+ const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue
+ const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
+ const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+ const __m128i b_new =
+ _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
+
+ const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+
+ // Fall-back to C-version for left-overs.
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
+ uint32_t* argb_data,
+ int num_pixels) {
+ const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers
+ const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
+ const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
+
+ int i;
+
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
+ const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
+ const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
+ const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
+ const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green
+ const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
+ const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
+ const __m128i b = in;
+
+ const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red
+ const __m128i r_new =
+ _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
+ const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
+
+ const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue
+ const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
+ const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
+ const __m128i b_new =
+ _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
+
+ const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+
+ // Fall-back to C-version for left-overs.
+ VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ while (num_pixels >= 8) {
+ const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
+ const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
+ const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
+ const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
+ const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
+ const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
+ const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
+ const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
+ const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
+ const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
+ const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
+ const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
+ const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
+ const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
+ _mm_storeu_si128(out++, rgba0);
+ _mm_storeu_si128(out++, rgba4);
+ num_pixels -= 8;
+ }
+ // left-overs
+ VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+}
+
+static void ConvertBGRAToRGBA4444(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
+ const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ while (num_pixels >= 8) {
+ const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
+ const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
+ const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
+ const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
+ const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
+ const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
+ const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
+ const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
+ const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
+ const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
+ const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7-
+ const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-b7
+ const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
+ const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
+ const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
+#ifdef WEBP_SWAP_16BIT_CSP
+ const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
+#else
+ const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
+#endif
+ _mm_storeu_si128(out++, rgba);
+ num_pixels -= 8;
+ }
+ // left-overs
+ VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+}
+
+static void ConvertBGRAToRGB565(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
+ const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
+ const __m128i mask_0x07 = _mm_set1_epi8(0x07);
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ while (num_pixels >= 8) {
+ const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
+ const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
+ const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
+ const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
+ const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
+ const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
+ const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
+ const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
+ const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
+ const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
+ const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7
+ const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
+ const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b)
+ const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
+ const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b)
+ const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0
+ const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
+ const __m128i b1 = _mm_srli_epi16(b0, 3);
+ const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
+#ifdef WEBP_SWAP_16BIT_CSP
+ const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // gbrg0...gbrg7
+#else
+ const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // rggb0...rggb7
+#endif
+ _mm_storeu_si128(out++, rgba);
+ num_pixels -= 8;
+ }
+ // left-overs
+ VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
+ const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
+ const __m128i* in = (const __m128i*)src;
+ const uint8_t* const end = dst + num_pixels * 3;
+ // the last storel_epi64 below writes 8 bytes starting at offset 18
+ while (dst + 26 <= end) {
+ const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
+ const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
+ const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0
+ const __m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0
+ const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0
+ const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0
+ const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00
+ const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00
+ const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00
+ const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00
+ const __m128i c2 = _mm_srli_si128(c0, 8);
+ const __m128i c6 = _mm_srli_si128(c4, 8);
+ _mm_storel_epi64((__m128i*)(dst + 0), c0);
+ _mm_storel_epi64((__m128i*)(dst + 6), c2);
+ _mm_storel_epi64((__m128i*)(dst + 12), c4);
+ _mm_storel_epi64((__m128i*)(dst + 18), c6);
+ dst += 24;
+ num_pixels -= 8;
+ }
+ // left-overs
+ VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
+}
+
+//------------------------------------------------------------------------------
+
+#define LINE_SIZE 16 // 8 or 16
+static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
+ int i;
+ assert(size % LINE_SIZE == 0);
+ for (i = 0; i < size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((__m128i*)&b[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((__m128i*)&b[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((__m128i*)&b[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((__m128i*)&b[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+}
+
+static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+ int i;
+ assert(size % LINE_SIZE == 0);
+ for (i = 0; i < size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((__m128i*)&out[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((__m128i*)&out[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((__m128i*)&out[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((__m128i*)&out[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+}
+#undef LINE_SIZE
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+static void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ int i;
+ const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ if (b != out) {
+ AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+ AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+ AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+ AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+ } else {
+ AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
+ AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
+ AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
+ AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+ }
+ for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
+ out->literal_[i] = a->literal_[i] + b->literal_[i];
+ }
+ for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+ out->distance_[i] = a->distance_[i] + b->distance_[i];
+ }
+}
+
+#endif // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+
+extern void VP8LDspInitSSE2(void);
+
+void VP8LDspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+ VP8LPredictors[5] = Predictor5;
+ VP8LPredictors[6] = Predictor6;
+ VP8LPredictors[7] = Predictor7;
+ VP8LPredictors[8] = Predictor8;
+ VP8LPredictors[9] = Predictor9;
+ VP8LPredictors[10] = Predictor10;
+ VP8LPredictors[11] = Predictor11;
+ VP8LPredictors[12] = Predictor12;
+ VP8LPredictors[13] = Predictor13;
+
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+
+ VP8LTransformColor = TransformColor;
+ VP8LTransformColorInverse = TransformColorInverse;
+
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+ VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
+ VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+
+ VP8LHistogramAdd = HistogramAdd;
+#endif // WEBP_USE_SSE2
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h
new file mode 100644
index 0000000..7e06eae
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/neon.h
@@ -0,0 +1,82 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON common code.
+
+#ifndef WEBP_DSP_NEON_H_
+#define WEBP_DSP_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./dsp.h"
+
+// Right now, some intrinsic functions seem slower, so we disable them
+// everywhere except on aarch64, where the inline assembly is incompatible.
+#if defined(__aarch64__)
+#define USE_INTRINSICS // use intrinsics when possible
+#endif
+
+#define INIT_VECTOR2(v, a, b) do { \
+ v.val[0] = a; \
+ v.val[1] = b; \
+} while (0)
+
+#define INIT_VECTOR3(v, a, b, c) do { \
+ v.val[0] = a; \
+ v.val[1] = b; \
+ v.val[2] = c; \
+} while (0)
+
+#define INIT_VECTOR4(v, a, b, c, d) do { \
+ v.val[0] = a; \
+ v.val[1] = b; \
+ v.val[2] = c; \
+ v.val[3] = d; \
+} while (0)
+
+// If intrinsics are used, this flag avoids some functions that make gcc-4.6.3
+// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
+// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
+#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WORK_AROUND_GCC
+#endif
+
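+// Transposes the 4x4 matrix of 32-bit values held in 'rows':
+// on input rows.val[i] is row i; in the returned value, val[i] is column i.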
+static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
+ uint64x2x2_t row01, row23;
+
+ row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
+ row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
+ row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
+ row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
+ // Transpose 64-bit values (there's no vswp equivalent)
+ {
+ const uint64x1_t row0h = vget_high_u64(row01.val[0]);
+ const uint64x1_t row2l = vget_low_u64(row23.val[0]);
+ const uint64x1_t row1h = vget_high_u64(row01.val[1]);
+ const uint64x1_t row3l = vget_low_u64(row23.val[1]);
+ row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
+ row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
+ row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
+ row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
+ }
+ {
+ const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
+ vreinterpretq_s32_u64(row01.val[1]));
+ const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
+ vreinterpretq_s32_u64(row23.val[1]));
+ int32x4x4_t out;
+ out.val[0] = out01.val[0];
+ out.val[1] = out01.val[1];
+ out.val[2] = out23.val[0];
+ out.val[3] = out23.val[1];
+ return out;
+ }
+}
+
+#endif // WEBP_DSP_NEON_H_
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 978e3ce..53c68d5 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -107,57 +107,6 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
-// simple point-sampling
-
-#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
- const uint8_t* u, const uint8_t* v, \
- uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
- int i; \
- for (i = 0; i < len - 1; i += 2) { \
- FUNC(top_y[0], u[0], v[0], top_dst); \
- FUNC(top_y[1], u[0], v[0], top_dst + XSTEP); \
- FUNC(bottom_y[0], u[0], v[0], bottom_dst); \
- FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP); \
- top_y += 2; \
- bottom_y += 2; \
- u++; \
- v++; \
- top_dst += 2 * XSTEP; \
- bottom_dst += 2 * XSTEP; \
- } \
- if (i == len - 1) { /* last one */ \
- FUNC(top_y[0], u[0], v[0], top_dst); \
- FUNC(bottom_y[0], u[0], v[0], bottom_dst); \
- } \
-}
-
-// All variants implemented.
-SAMPLE_FUNC(SampleRgbLinePair, VP8YuvToRgb, 3)
-SAMPLE_FUNC(SampleBgrLinePair, VP8YuvToBgr, 3)
-SAMPLE_FUNC(SampleRgbaLinePair, VP8YuvToRgba, 4)
-SAMPLE_FUNC(SampleBgraLinePair, VP8YuvToBgra, 4)
-SAMPLE_FUNC(SampleArgbLinePair, VP8YuvToArgb, 4)
-SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-SAMPLE_FUNC(SampleRgb565LinePair, VP8YuvToRgb565, 2)
-
-#undef SAMPLE_FUNC
-
-const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
- SampleRgbLinePair, // MODE_RGB
- SampleRgbaLinePair, // MODE_RGBA
- SampleBgrLinePair, // MODE_BGR
- SampleBgraLinePair, // MODE_BGRA
- SampleArgbLinePair, // MODE_ARGB
- SampleRgba4444LinePair, // MODE_RGBA_4444
- SampleRgb565LinePair, // MODE_RGB_565
- SampleRgbaLinePair, // MODE_rgbA
- SampleBgraLinePair, // MODE_bgrA
- SampleArgbLinePair, // MODE_Argb
- SampleRgba4444LinePair // MODE_rgbA_4444
-};
-
-//------------------------------------------------------------------------------
#if !defined(FANCY_UPSAMPLING)
#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
@@ -235,85 +184,17 @@ const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
};
//------------------------------------------------------------------------------
-// Premultiplied modes
-
-// non dithered-modes
-
-// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
-// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
-// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
-#if 1 // (int)(x * a / 255.)
-#define MULTIPLIER(a) ((a) * 32897UL)
-#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
-#else // (int)(x * a / 255. + .5)
-#define MULTIPLIER(a) ((a) * 65793UL)
-#define PREMULTIPLY(x, m) (((x) * (m) + (1UL << 23)) >> 24)
-#endif
-
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
- int w, int h, int stride) {
- while (h-- > 0) {
- uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
- const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
- int i;
- for (i = 0; i < w; ++i) {
- const uint32_t a = alpha[4 * i];
- if (a != 0xff) {
- const uint32_t mult = MULTIPLIER(a);
- rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
- rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
- rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
- }
- }
- rgba += stride;
- }
-}
-#undef MULTIPLIER
-#undef PREMULTIPLY
-
-// rgbA4444
-
-#define MULTIPLIER(a) ((a) * 0x1111) // 0x1111 ~= (1 << 16) / 15
-
-static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
- return (x & 0xf0) | (x >> 4);
-}
-
-static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
- return (x & 0x0f) | (x << 4);
-}
-
-static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
- return (x * m) >> 16;
-}
+// Main calls
-static void ApplyAlphaMultiply4444(uint8_t* rgba4444,
- int w, int h, int stride) {
- while (h-- > 0) {
- int i;
- for (i = 0; i < w; ++i) {
- const uint8_t a = (rgba4444[2 * i + 1] & 0x0f);
- const uint32_t mult = MULTIPLIER(a);
- const uint8_t r = multiply(dither_hi(rgba4444[2 * i + 0]), mult);
- const uint8_t g = multiply(dither_lo(rgba4444[2 * i + 0]), mult);
- const uint8_t b = multiply(dither_hi(rgba4444[2 * i + 1]), mult);
- rgba4444[2 * i + 0] = (r & 0xf0) | ((g >> 4) & 0x0f);
- rgba4444[2 * i + 1] = (b & 0xf0) | a;
- }
- rgba4444 += stride;
- }
-}
-#undef MULTIPLIER
-
-void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int)
- = ApplyAlphaMultiply;
-void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int)
- = ApplyAlphaMultiply4444;
+extern void WebPInitUpsamplersSSE2(void);
+extern void WebPInitUpsamplersNEON(void);
-//------------------------------------------------------------------------------
-// Main call
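+// Pointer-to-self sentinel: the initialization below is re-run only when
+// VP8GetCPUInfo has changed since the last call.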
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
+ (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
void WebPInitUpsamplers(void) {
+ if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
+
#ifdef FANCY_UPSAMPLING
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
@@ -322,45 +203,26 @@ void WebPInitUpsamplers(void) {
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
-
- // If defined, use CPUInfo() to overwrite some pointers with faster versions.
- if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
- if (VP8GetCPUInfo(kSSE2)) {
- WebPInitUpsamplersSSE2();
- }
-#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- WebPInitUpsamplersNEON();
- }
-#endif
- }
-#endif // FANCY_UPSAMPLING
-}
-
-void WebPInitPremultiply(void) {
- WebPApplyAlphaMultiply = ApplyAlphaMultiply;
- WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply4444;
-
-#ifdef FANCY_UPSAMPLING
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
- WebPInitPremultiplySSE2();
+ WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
- WebPInitPremultiplyNEON();
+ WebPInitUpsamplersNEON();
}
#endif
}
#endif // FANCY_UPSAMPLING
+ upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
index 791222f..d31ed4d 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -19,6 +19,7 @@
#include <assert.h>
#include <arm_neon.h>
#include <string.h>
+#include "./neon.h"
#include "./yuv.h"
#ifdef FANCY_UPSAMPLING
@@ -61,8 +62,9 @@
d = vrhadd_u8(d, diag1); \
\
{ \
- const uint8x8x2_t a_b = {{ a, b }}; \
- const uint8x8x2_t c_d = {{ c, d }}; \
+ uint8x8x2_t a_b, c_d; \
+ INIT_VECTOR2(a_b, a, b); \
+ INIT_VECTOR2(c_d, c, d); \
vst2_u8(out, a_b); \
vst2_u8(out + 32, c_d); \
} \
@@ -89,25 +91,29 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
-#define v255 vmov_n_u8(255)
+#define v255 vdup_n_u8(255)
#define STORE_Rgb(out, r, g, b) do { \
- const uint8x8x3_t r_g_b = {{ r, g, b }}; \
+ uint8x8x3_t r_g_b; \
+ INIT_VECTOR3(r_g_b, r, g, b); \
vst3_u8(out, r_g_b); \
} while (0)
#define STORE_Bgr(out, r, g, b) do { \
- const uint8x8x3_t b_g_r = {{ b, g, r }}; \
+ uint8x8x3_t b_g_r; \
+ INIT_VECTOR3(b_g_r, b, g, r); \
vst3_u8(out, b_g_r); \
} while (0)
#define STORE_Rgba(out, r, g, b) do { \
- const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \
+ uint8x8x4_t r_g_b_v255; \
+ INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \
vst4_u8(out, r_g_b_v255); \
} while (0)
#define STORE_Bgra(out, r, g, b) do { \
- const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \
+ uint8x8x4_t b_g_r_v255; \
+ INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \
vst4_u8(out, b_g_r_v255); \
} while (0)
@@ -190,9 +196,9 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
const int16x4_t cf16 = vld1_s16(kCoeffs); \
- const int32x2_t cf32 = vmov_n_s32(kUToB); \
- const uint8x8_t u16 = vmov_n_u8(16); \
- const uint8x8_t u128 = vmov_n_u8(128); \
+ const int32x2_t cf32 = vdup_n_s32(kUToB); \
+ const uint8x8_t u16 = vdup_n_u8(16); \
+ const uint8x8_t u128 = vdup_n_u8(128); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \
@@ -225,10 +231,10 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
}
// NEON variants of the fancy upsampler.
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON, Rgb, 3)
-NEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON, Bgr, 3)
-NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
-NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
#endif // FANCY_UPSAMPLING
@@ -236,30 +242,26 @@ NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
//------------------------------------------------------------------------------
+extern void WebPInitUpsamplersNEON(void);
+
#ifdef FANCY_UPSAMPLING
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
void WebPInitUpsamplersNEON(void) {
#if defined(WEBP_USE_NEON)
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairNEON;
- WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairNEON;
- WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
-#endif // WEBP_USE_NEON
-}
-
-void WebPInitPremultiplyNEON(void) {
-#if defined(WEBP_USE_NEON)
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#endif // WEBP_USE_NEON
}
#else
// this empty function is to avoid an empty .o
-void WebPInitPremultiplyNEON(void) {}
+void WebPInitUpsamplersNEON(void) {}
#endif // FANCY_UPSAMPLING
-
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
index 0db0798..45cf090 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -169,10 +169,10 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// SSE2 variants of the fancy upsampler.
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)
-SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
#undef GET_M
#undef PACK_AND_STORE
@@ -188,6 +188,8 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
//------------------------------------------------------------------------------
+extern void WebPInitUpsamplersSSE2(void);
+
#ifdef FANCY_UPSAMPLING
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
@@ -195,24 +197,18 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
void WebPInitUpsamplersSSE2(void) {
#if defined(WEBP_USE_SSE2)
VP8YUVInitSSE2();
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;
- WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;
- WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
-#endif // WEBP_USE_SSE2
-}
-
-void WebPInitPremultiplySSE2(void) {
-#if defined(WEBP_USE_SSE2)
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#endif // WEBP_USE_SSE2
}
#else
// this empty function is to avoid an empty .o
-void WebPInitPremultiplySSE2(void) {}
+void WebPInitUpsamplersSSE2(void) {}
#endif // FANCY_UPSAMPLING
-
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index 4f9cafc..6f422da 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -7,13 +7,12 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// YUV->RGB conversion function
+// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"
-
#if defined(WEBP_YUV_USE_TABLE)
static int done = 0;
@@ -68,140 +67,94 @@ void VP8YUVInit(void) {}
#endif // WEBP_YUV_USE_TABLE
//-----------------------------------------------------------------------------
-// SSE2 extras
-
-#if defined(WEBP_USE_SSE2)
-
-#ifdef FANCY_UPSAMPLING
-
-#include <emmintrin.h>
-#include <string.h> // for memcpy
-
-typedef union { // handy struct for converting SSE2 registers
- int32_t i32[4];
- uint8_t u8[16];
- __m128i m;
-} VP8kCstSSE2;
-
-static int done_sse2 = 0;
-static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
-
-void VP8YUVInitSSE2(void) {
- if (!done_sse2) {
- int i;
- for (i = 0; i < 256; ++i) {
- VP8kYtoRGBA[i].i32[0] =
- VP8kYtoRGBA[i].i32[1] =
- VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
- VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
-
- VP8kUtoRGBA[i].i32[0] = 0;
- VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
- VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
- VP8kUtoRGBA[i].i32[3] = 0;
-
- VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
- VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
- VP8kVtoRGBA[i].i32[2] = 0;
- VP8kVtoRGBA[i].i32[3] = 0;
+// Plain-C version
+
+#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ const uint8_t* const end = dst + (len & ~1) * XSTEP; \
+ while (dst != end) { \
+ FUNC(y[0], u[0], v[0], dst); \
+ FUNC(y[1], u[0], v[0], dst + XSTEP); \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ FUNC(y[0], u[0], v[0], dst); \
+ } \
+} \
+
+// All variants implemented.
+ROW_FUNC(YuvToRgbRow, VP8YuvToRgb, 3)
+ROW_FUNC(YuvToBgrRow, VP8YuvToBgr, 3)
+ROW_FUNC(YuvToRgbaRow, VP8YuvToRgba, 4)
+ROW_FUNC(YuvToBgraRow, VP8YuvToBgra, 4)
+ROW_FUNC(YuvToArgbRow, VP8YuvToArgb, 4)
+ROW_FUNC(YuvToRgba4444Row, VP8YuvToRgba4444, 2)
+ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
+
+#undef ROW_FUNC
+
+// Main call for processing a plane with a WebPSamplerRowFunc function:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+ const uint8_t* u, const uint8_t* v, int uv_stride,
+ uint8_t* dst, int dst_stride,
+ int width, int height, WebPSamplerRowFunc func) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ func(y, u, v, dst, width);
+ y += y_stride;
+ if (j & 1) {
+ u += uv_stride;
+ v += uv_stride;
}
- done_sse2 = 1;
+ dst += dst_stride;
}
}
-static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
- const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
- const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
- const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
- const __m128i uv_part = _mm_add_epi32(u_part, v_part);
- const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
- const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
- return rgba2;
-}
+//-----------------------------------------------------------------------------
+// Main call
-static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const rgb) {
- const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
- const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
- const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
- // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
- _mm_storel_epi64((__m128i*)rgb, tmp2);
-}
+WebPSamplerRowFunc WebPSamplers[MODE_LAST];
-static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const bgr) {
- const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
- const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
- const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
- // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
- _mm_storel_epi64((__m128i*)bgr, tmp3);
-}
+extern void WebPInitSamplersSSE2(void);
+extern void WebPInitSamplersMIPS32(void);
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
- int n;
- for (n = 0; n < 32; n += 4) {
- const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
- const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
- const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
- const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
- const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
- const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
- const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
- _mm_storeu_si128((__m128i*)dst, tmp2);
- dst += 4 * 4;
- }
-}
+static volatile VP8CPUInfo yuv_last_cpuinfo_used =
+ (VP8CPUInfo)&yuv_last_cpuinfo_used;
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
- int n;
- for (n = 0; n < 32; n += 2) {
- const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
- const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
- const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
- const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
- _mm_storel_epi64((__m128i*)dst, tmp3);
- dst += 4 * 2;
- }
-}
+void WebPInitSamplers(void) {
+ if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
- int n;
- uint8_t tmp0[2 * 3 + 5 + 15];
- uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
- for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory
- VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
- }
- // Last two pixels are special: we write in a tmp buffer before sending
- // to dst.
- VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
- VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
- memcpy(dst + n * 3, tmp, 2 * 3);
-}
+ WebPSamplers[MODE_RGB] = YuvToRgbRow;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+ WebPSamplers[MODE_ARGB] = YuvToArgbRow;
+ WebPSamplers[MODE_RGBA_4444] = YuvToRgba4444Row;
+ WebPSamplers[MODE_RGB_565] = YuvToRgb565Row;
+ WebPSamplers[MODE_rgbA] = YuvToRgbaRow;
+ WebPSamplers[MODE_bgrA] = YuvToBgraRow;
+ WebPSamplers[MODE_Argb] = YuvToArgbRow;
+ WebPSamplers[MODE_rgbA_4444] = YuvToRgba4444Row;
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
- int n;
- uint8_t tmp0[2 * 3 + 5 + 15];
- uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
- for (n = 0; n < 30; ++n) {
- VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitSamplersSSE2();
+ }
+#endif // WEBP_USE_SSE2
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ WebPInitSamplersMIPS32();
+ }
+#endif // WEBP_USE_MIPS32
}
- VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
- VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
- memcpy(dst + n * 3, tmp, 2 * 3);
+ yuv_last_cpuinfo_used = VP8GetCPUInfo;
}
-#else
-
-void VP8YUVInitSSE2(void) {}
-
-#endif // FANCY_UPSAMPLING
-
-#endif // WEBP_USE_SSE2
-
+//-----------------------------------------------------------------------------
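
WebPSamplerProcessPlane is the generic driver for the new row converters: it walks a whole 4:2:0 frame, and the (j & 1) test advances the chroma pointers only after every second row because one u/v row covers two luma rows. The self-pointing yuv_last_cpuinfo_used sentinel makes WebPInitSamplers idempotent; the table setup re-runs only if VP8GetCPUInfo is swapped out. A usage sketch with hypothetical, caller-provided buffers:

    /* Convert a whole 4:2:0 frame to packed 24-bit RGB. */
    WebPInitSamplers();   /* fills WebPSamplers[], picking SIMD variants */
    WebPSamplerProcessPlane(y_plane, y_stride,
                            u_plane, v_plane, uv_stride,
                            rgb_out, 3 * width,   /* 3 bytes per pixel */
                            width, height, WebPSamplers[MODE_RGB]);
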
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index dd778f9..8a47edd 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -245,6 +245,10 @@ void VP8YUVInit(void);
#if defined(WEBP_USE_SSE2)
+// When the following is defined, tables are initialized statically, adding ~12k
+// to the binary size. Otherwise, they are initialized at run-time (small cost).
+#define WEBP_YUV_USE_SSE2_TABLES
+
#if defined(FANCY_UPSAMPLING)
// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
@@ -298,12 +302,12 @@ static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
return (luma + rounding) >> YUV_FIX; // no need to clip
}
-static WEBP_INLINE int VP8_RGB_TO_U(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
const int u = -11058 * r - 21710 * g + 32768 * b;
return VP8ClipUV(u, rounding);
}
-static WEBP_INLINE int VP8_RGB_TO_V(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
const int v = 32768 * r - 27439 * g - 5329 * b;
return VP8ClipUV(v, rounding);
}
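
Beyond the rename to function-style names, the constants are worth a second look: they are the BT.601 chroma rows in 16-bit fixed point (assuming YUV_FIX == 16), i.e. -11058/65536 ~ -0.169, -21710/65536 ~ -0.331, 32768/65536 = 0.5 for U, and similarly for V. Each row sums to exactly zero, so achromatic input (r == g == b) lands on the chroma midpoint once VP8ClipUV applies its 128 bias. A C89-style static check of that property:

    /* The fixed-point chroma rows sum to zero, so gray input yields
       centered chroma (coefficients copied from yuv.h above). */
    typedef char check_u_row[(-11058 - 21710 + 32768 == 0) ? 1 : -1];
    typedef char check_v_row[(32768 - 27439 - 5329 == 0) ? 1 : -1];
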
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
new file mode 100644
index 0000000..c82b4df
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
@@ -0,0 +1,100 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of YUV to RGB upsampling functions.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "./yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i, r, g, b; \
+ int temp0, temp1, temp2, temp3, temp4; \
+ for (i = 0; i < (len >> 1); i++) { \
+ temp1 = kVToR * v[0]; \
+ temp3 = kVToG * v[0]; \
+ temp2 = kUToG * u[0]; \
+ temp4 = kUToB * u[0]; \
+ temp0 = kYScale * y[0]; \
+ temp1 += kRCst; \
+ temp3 -= kGCst; \
+ temp2 += temp3; \
+ temp4 += kBCst; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ temp0 = kYScale * y[1]; \
+ dst[R] = r; \
+ dst[G] = g; \
+ dst[B] = b; \
+ if (A) dst[A] = 0xff; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ dst[R + XSTEP] = r; \
+ dst[G + XSTEP] = g; \
+ dst[B + XSTEP] = b; \
+ if (A) dst[A + XSTEP] = 0xff; \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ temp1 = kVToR * v[0]; \
+ temp3 = kVToG * v[0]; \
+ temp2 = kUToG * u[0]; \
+ temp4 = kUToB * u[0]; \
+ temp0 = kYScale * y[0]; \
+ temp1 += kRCst; \
+ temp3 -= kGCst; \
+ temp2 += temp3; \
+ temp4 += kBCst; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ dst[R] = r; \
+ dst[G] = g; \
+ dst[B] = b; \
+ if (A) dst[A] = 0xff; \
+ } \
+}
+
+ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+
+#endif // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSamplersMIPS32(void);
+
+void WebPInitSamplersMIPS32(void) {
+#if defined(WEBP_USE_MIPS32)
+ WebPSamplers[MODE_RGB] = YuvToRgbRow;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+#endif // WEBP_USE_MIPS32
+}
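
One detail of the macro above that is easy to miss: the chroma products (temp1 through temp4) are computed once per pixel pair and reused for the second pixel; only temp0, the luma product, is refreshed. That is exactly the sharing 4:2:0 permits. A scalar restatement of a single pixel, RGB order fixed for readability (kYScale, kVToR, kVToG, kUToG, kUToB and the kRCst/kGCst/kBCst rounding constants are the library's fixed-point coefficients; VP8Clip8 shifts back down and clamps to [0, 255]):

    static void OnePixelRgb(int y, int u, int v, uint8_t* dst) {
      const int luma = kYScale * y;
      dst[0] = VP8Clip8(luma + kVToR * v + kRCst);                /* R */
      dst[1] = VP8Clip8(luma - (kUToG * u + kVToG * v - kGCst));  /* G */
      dst[2] = VP8Clip8(luma + kUToB * u + kBCst);                /* B */
    }
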
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
new file mode 100644
index 0000000..6fe0f3b
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -0,0 +1,322 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./yuv.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <emmintrin.h>
+#include <string.h> // for memcpy
+
+typedef union {   // handy union for converting SSE2 registers
+ int32_t i32[4];
+ uint8_t u8[16];
+ __m128i m;
+} VP8kCstSSE2;
+
+#if defined(WEBP_YUV_USE_SSE2_TABLES)
+
+#include "./yuv_tables_sse2.h"
+
+void VP8YUVInitSSE2(void) {}
+
+#else
+
+static int done_sse2 = 0;
+static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
+
+void VP8YUVInitSSE2(void) {
+ if (!done_sse2) {
+ int i;
+ for (i = 0; i < 256; ++i) {
+ VP8kYtoRGBA[i].i32[0] =
+ VP8kYtoRGBA[i].i32[1] =
+ VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
+ VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
+
+ VP8kUtoRGBA[i].i32[0] = 0;
+ VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
+ VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
+ VP8kUtoRGBA[i].i32[3] = 0;
+
+ VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
+ VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
+ VP8kVtoRGBA[i].i32[2] = 0;
+ VP8kVtoRGBA[i].i32[3] = 0;
+ }
+ done_sse2 = 1;
+
+#if 0 // code used to generate 'yuv_tables_sse2.h'
+ printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
+ for (i = 0; i < 256; ++i) {
+ printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
+ VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1],
+ VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]);
+ }
+ printf("};\n\n");
+ printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
+ for (i = 0; i < 256; ++i) {
+ printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n",
+ VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]);
+ }
+ printf("};\n\n");
+ printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
+ for (i = 0; i < 256; ++i) {
+ printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n",
+ VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]);
+ }
+ printf("};\n\n");
+#endif
+ }
+}
+
+#endif // WEBP_YUV_USE_SSE2_TABLES
+
+//-----------------------------------------------------------------------------
+
+static WEBP_INLINE __m128i LoadUVPart(int u, int v) {
+ const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
+ const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
+ const __m128i uv_part = _mm_add_epi32(u_part, v_part);
+ return uv_part;
+}
+
+static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
+ const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
+ const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
+ const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
+ return rgba2;
+}
+
+static WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) {
+ const __m128i uv_part = LoadUVPart(u, v);
+ return GetRGBA32bWithUV(y, uv_part);
+}
+
+static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const rgb) {
+ const __m128i tmp0 = GetRGBA32b(y, u, v);
+ const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
+ const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
+ // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
+ _mm_storel_epi64((__m128i*)rgb, tmp2);
+}
+
+static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const bgr) {
+ const __m128i tmp0 = GetRGBA32b(y, u, v);
+ const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
+ const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
+ const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
+ // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
+ _mm_storel_epi64((__m128i*)bgr, tmp3);
+}
+
+//-----------------------------------------------------------------------------
+// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
+
+#ifdef FANCY_UPSAMPLING
+
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ int n;
+ for (n = 0; n < 32; n += 4) {
+ const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
+ const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
+ const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
+ const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
+ const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
+ const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
+ const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
+ _mm_storeu_si128((__m128i*)dst, tmp2);
+ dst += 4 * 4;
+ }
+}
+
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ int n;
+ for (n = 0; n < 32; n += 2) {
+ const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
+ const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
+ const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
+ const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
+ const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
+ const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
+ _mm_storel_epi64((__m128i*)dst, tmp3);
+ dst += 4 * 2;
+ }
+}
+
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ int n;
+ uint8_t tmp0[2 * 3 + 5 + 15];
+ uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
+ for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory
+ YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
+ }
+ // Last two pixels are special: we write in a tmp buffer before sending
+ // to dst.
+ YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
+ YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
+ memcpy(dst + n * 3, tmp, 2 * 3);
+}
+
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ int n;
+ uint8_t tmp0[2 * 3 + 5 + 15];
+ uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
+ for (n = 0; n < 30; ++n) {
+ YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
+ }
+ YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
+ YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
+ memcpy(dst + n * 3, tmp, 2 * 3);
+}
+
+#endif // FANCY_UPSAMPLING
+
+//-----------------------------------------------------------------------------
+// Arbitrary-length row conversion functions
+
+static void YuvToRgbaRowSSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ int n;
+ for (n = 0; n + 4 <= len; n += 4) {
+ const __m128i uv_0 = LoadUVPart(u[0], v[0]);
+ const __m128i uv_1 = LoadUVPart(u[1], v[1]);
+ const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
+ const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
+ const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1);
+ const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1);
+ const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
+ const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
+ const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
+ _mm_storeu_si128((__m128i*)dst, tmp2);
+ dst += 4 * 4;
+ y += 4;
+ u += 2;
+ v += 2;
+ }
+ // Finish off
+ while (n < len) {
+ VP8YuvToRgba(y[0], u[0], v[0], dst);
+ dst += 4;
+ ++y;
+ u += (n & 1);
+ v += (n & 1);
+ ++n;
+ }
+}
+
+static void YuvToBgraRowSSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ int n;
+ for (n = 0; n + 2 <= len; n += 2) {
+ const __m128i uv_0 = LoadUVPart(u[0], v[0]);
+ const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
+ const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
+ const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
+ const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
+ const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
+ const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
+ _mm_storel_epi64((__m128i*)dst, tmp3);
+ dst += 4 * 2;
+ y += 2;
+ ++u;
+ ++v;
+ }
+ // Finish off
+ if (len & 1) {
+ VP8YuvToBgra(y[0], u[0], v[0], dst);
+ }
+}
+
+static void YuvToArgbRowSSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ int n;
+ for (n = 0; n + 2 <= len; n += 2) {
+ const __m128i uv_0 = LoadUVPart(u[0], v[0]);
+ const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
+ const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
+ const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3));
+ const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3));
+ const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
+ const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
+ _mm_storel_epi64((__m128i*)dst, tmp3);
+ dst += 4 * 2;
+ y += 2;
+ ++u;
+ ++v;
+ }
+ // Finish off
+ if (len & 1) {
+ VP8YuvToArgb(y[0], u[0], v[0], dst);
+ }
+}
+
+static void YuvToRgbRowSSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ int n;
+ for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory
+ YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes
+ dst += 3;
+ ++y;
+ u += (n & 1);
+ v += (n & 1);
+ }
+ VP8YuvToRgb(y[0], u[0], v[0], dst);
+ if (len > 1) {
+ VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
+ }
+}
+
+static void YuvToBgrRowSSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ int n;
+ for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory
+ YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes
+ dst += 3;
+ ++y;
+ u += (n & 1);
+ v += (n & 1);
+ }
+ VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
+ if (len > 1) {
+ VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
+ }
+}
+
+#endif // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersSSE2(void);
+
+void WebPInitSamplersSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+ WebPSamplers[MODE_RGB] = YuvToRgbRowSSE2;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2;
+ WebPSamplers[MODE_BGR] = YuvToBgrRowSSE2;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2;
+ WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2;
+#endif // WEBP_USE_SSE2
+}
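
This new file keeps both table strategies behind WEBP_YUV_USE_SSE2_TABLES: when defined (as yuv.h now does by default), the three 256-entry tables come precomputed from yuv_tables_sse2.h and VP8YUVInitSSE2() is a no-op; when not, the same tables are built on first use, and the generator kept under #if 0 documents how the header was produced. The other notable change is the LoadUVPart/GetRGBA32bWithUV split: the chroma half of the sum is hoisted so each 4:2:0 pixel pair loads it only once. In scalar form, what one lane of the table-driven conversion computes (constants as defined in yuv.h; the result is still clamped later by the pack instructions):

    static int OneLaneR(int y, int v) {
      const int y_part = (y - 16) * kYScale + YUV_HALF2;  /* VP8kYtoRGBA[y] */
      const int v_part = kVToR * (v - 128);               /* VP8kVtoRGBA[v] */
      return (y_part + v_part) >> YUV_FIX2;               /* pre-clamp value */
    }
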
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h b/src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h
new file mode 100644
index 0000000..2b0f057
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h
@@ -0,0 +1,536 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 tables for YUV->RGB conversion (12kB overall)
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+// This file is not compiled, but #include'd directly from yuv.c
+// Only used if WEBP_YUV_USE_SSE2_TABLES is defined.
+
+static const VP8kCstSSE2 VP8kYtoRGBA[256] = {
+ {{0xfffb77b0, 0xfffb77b0, 0xfffb77b0, 0x003fc000}},
+ {{0xfffbc235, 0xfffbc235, 0xfffbc235, 0x003fc000}},
+ {{0xfffc0cba, 0xfffc0cba, 0xfffc0cba, 0x003fc000}},
+ {{0xfffc573f, 0xfffc573f, 0xfffc573f, 0x003fc000}},
+ {{0xfffca1c4, 0xfffca1c4, 0xfffca1c4, 0x003fc000}},
+ {{0xfffcec49, 0xfffcec49, 0xfffcec49, 0x003fc000}},
+ {{0xfffd36ce, 0xfffd36ce, 0xfffd36ce, 0x003fc000}},
+ {{0xfffd8153, 0xfffd8153, 0xfffd8153, 0x003fc000}},
+ {{0xfffdcbd8, 0xfffdcbd8, 0xfffdcbd8, 0x003fc000}},
+ {{0xfffe165d, 0xfffe165d, 0xfffe165d, 0x003fc000}},
+ {{0xfffe60e2, 0xfffe60e2, 0xfffe60e2, 0x003fc000}},
+ {{0xfffeab67, 0xfffeab67, 0xfffeab67, 0x003fc000}},
+ {{0xfffef5ec, 0xfffef5ec, 0xfffef5ec, 0x003fc000}},
+ {{0xffff4071, 0xffff4071, 0xffff4071, 0x003fc000}},
+ {{0xffff8af6, 0xffff8af6, 0xffff8af6, 0x003fc000}},
+ {{0xffffd57b, 0xffffd57b, 0xffffd57b, 0x003fc000}},
+ {{0x00002000, 0x00002000, 0x00002000, 0x003fc000}},
+ {{0x00006a85, 0x00006a85, 0x00006a85, 0x003fc000}},
+ {{0x0000b50a, 0x0000b50a, 0x0000b50a, 0x003fc000}},
+ {{0x0000ff8f, 0x0000ff8f, 0x0000ff8f, 0x003fc000}},
+ {{0x00014a14, 0x00014a14, 0x00014a14, 0x003fc000}},
+ {{0x00019499, 0x00019499, 0x00019499, 0x003fc000}},
+ {{0x0001df1e, 0x0001df1e, 0x0001df1e, 0x003fc000}},
+ {{0x000229a3, 0x000229a3, 0x000229a3, 0x003fc000}},
+ {{0x00027428, 0x00027428, 0x00027428, 0x003fc000}},
+ {{0x0002bead, 0x0002bead, 0x0002bead, 0x003fc000}},
+ {{0x00030932, 0x00030932, 0x00030932, 0x003fc000}},
+ {{0x000353b7, 0x000353b7, 0x000353b7, 0x003fc000}},
+ {{0x00039e3c, 0x00039e3c, 0x00039e3c, 0x003fc000}},
+ {{0x0003e8c1, 0x0003e8c1, 0x0003e8c1, 0x003fc000}},
+ {{0x00043346, 0x00043346, 0x00043346, 0x003fc000}},
+ {{0x00047dcb, 0x00047dcb, 0x00047dcb, 0x003fc000}},
+ {{0x0004c850, 0x0004c850, 0x0004c850, 0x003fc000}},
+ {{0x000512d5, 0x000512d5, 0x000512d5, 0x003fc000}},
+ {{0x00055d5a, 0x00055d5a, 0x00055d5a, 0x003fc000}},
+ {{0x0005a7df, 0x0005a7df, 0x0005a7df, 0x003fc000}},
+ {{0x0005f264, 0x0005f264, 0x0005f264, 0x003fc000}},
+ {{0x00063ce9, 0x00063ce9, 0x00063ce9, 0x003fc000}},
+ {{0x0006876e, 0x0006876e, 0x0006876e, 0x003fc000}},
+ {{0x0006d1f3, 0x0006d1f3, 0x0006d1f3, 0x003fc000}},
+ {{0x00071c78, 0x00071c78, 0x00071c78, 0x003fc000}},
+ {{0x000766fd, 0x000766fd, 0x000766fd, 0x003fc000}},
+ {{0x0007b182, 0x0007b182, 0x0007b182, 0x003fc000}},
+ {{0x0007fc07, 0x0007fc07, 0x0007fc07, 0x003fc000}},
+ {{0x0008468c, 0x0008468c, 0x0008468c, 0x003fc000}},
+ {{0x00089111, 0x00089111, 0x00089111, 0x003fc000}},
+ {{0x0008db96, 0x0008db96, 0x0008db96, 0x003fc000}},
+ {{0x0009261b, 0x0009261b, 0x0009261b, 0x003fc000}},
+ {{0x000970a0, 0x000970a0, 0x000970a0, 0x003fc000}},
+ {{0x0009bb25, 0x0009bb25, 0x0009bb25, 0x003fc000}},
+ {{0x000a05aa, 0x000a05aa, 0x000a05aa, 0x003fc000}},
+ {{0x000a502f, 0x000a502f, 0x000a502f, 0x003fc000}},
+ {{0x000a9ab4, 0x000a9ab4, 0x000a9ab4, 0x003fc000}},
+ {{0x000ae539, 0x000ae539, 0x000ae539, 0x003fc000}},
+ {{0x000b2fbe, 0x000b2fbe, 0x000b2fbe, 0x003fc000}},
+ {{0x000b7a43, 0x000b7a43, 0x000b7a43, 0x003fc000}},
+ {{0x000bc4c8, 0x000bc4c8, 0x000bc4c8, 0x003fc000}},
+ {{0x000c0f4d, 0x000c0f4d, 0x000c0f4d, 0x003fc000}},
+ {{0x000c59d2, 0x000c59d2, 0x000c59d2, 0x003fc000}},
+ {{0x000ca457, 0x000ca457, 0x000ca457, 0x003fc000}},
+ {{0x000ceedc, 0x000ceedc, 0x000ceedc, 0x003fc000}},
+ {{0x000d3961, 0x000d3961, 0x000d3961, 0x003fc000}},
+ {{0x000d83e6, 0x000d83e6, 0x000d83e6, 0x003fc000}},
+ {{0x000dce6b, 0x000dce6b, 0x000dce6b, 0x003fc000}},
+ {{0x000e18f0, 0x000e18f0, 0x000e18f0, 0x003fc000}},
+ {{0x000e6375, 0x000e6375, 0x000e6375, 0x003fc000}},
+ {{0x000eadfa, 0x000eadfa, 0x000eadfa, 0x003fc000}},
+ {{0x000ef87f, 0x000ef87f, 0x000ef87f, 0x003fc000}},
+ {{0x000f4304, 0x000f4304, 0x000f4304, 0x003fc000}},
+ {{0x000f8d89, 0x000f8d89, 0x000f8d89, 0x003fc000}},
+ {{0x000fd80e, 0x000fd80e, 0x000fd80e, 0x003fc000}},
+ {{0x00102293, 0x00102293, 0x00102293, 0x003fc000}},
+ {{0x00106d18, 0x00106d18, 0x00106d18, 0x003fc000}},
+ {{0x0010b79d, 0x0010b79d, 0x0010b79d, 0x003fc000}},
+ {{0x00110222, 0x00110222, 0x00110222, 0x003fc000}},
+ {{0x00114ca7, 0x00114ca7, 0x00114ca7, 0x003fc000}},
+ {{0x0011972c, 0x0011972c, 0x0011972c, 0x003fc000}},
+ {{0x0011e1b1, 0x0011e1b1, 0x0011e1b1, 0x003fc000}},
+ {{0x00122c36, 0x00122c36, 0x00122c36, 0x003fc000}},
+ {{0x001276bb, 0x001276bb, 0x001276bb, 0x003fc000}},
+ {{0x0012c140, 0x0012c140, 0x0012c140, 0x003fc000}},
+ {{0x00130bc5, 0x00130bc5, 0x00130bc5, 0x003fc000}},
+ {{0x0013564a, 0x0013564a, 0x0013564a, 0x003fc000}},
+ {{0x0013a0cf, 0x0013a0cf, 0x0013a0cf, 0x003fc000}},
+ {{0x0013eb54, 0x0013eb54, 0x0013eb54, 0x003fc000}},
+ {{0x001435d9, 0x001435d9, 0x001435d9, 0x003fc000}},
+ {{0x0014805e, 0x0014805e, 0x0014805e, 0x003fc000}},
+ {{0x0014cae3, 0x0014cae3, 0x0014cae3, 0x003fc000}},
+ {{0x00151568, 0x00151568, 0x00151568, 0x003fc000}},
+ {{0x00155fed, 0x00155fed, 0x00155fed, 0x003fc000}},
+ {{0x0015aa72, 0x0015aa72, 0x0015aa72, 0x003fc000}},
+ {{0x0015f4f7, 0x0015f4f7, 0x0015f4f7, 0x003fc000}},
+ {{0x00163f7c, 0x00163f7c, 0x00163f7c, 0x003fc000}},
+ {{0x00168a01, 0x00168a01, 0x00168a01, 0x003fc000}},
+ {{0x0016d486, 0x0016d486, 0x0016d486, 0x003fc000}},
+ {{0x00171f0b, 0x00171f0b, 0x00171f0b, 0x003fc000}},
+ {{0x00176990, 0x00176990, 0x00176990, 0x003fc000}},
+ {{0x0017b415, 0x0017b415, 0x0017b415, 0x003fc000}},
+ {{0x0017fe9a, 0x0017fe9a, 0x0017fe9a, 0x003fc000}},
+ {{0x0018491f, 0x0018491f, 0x0018491f, 0x003fc000}},
+ {{0x001893a4, 0x001893a4, 0x001893a4, 0x003fc000}},
+ {{0x0018de29, 0x0018de29, 0x0018de29, 0x003fc000}},
+ {{0x001928ae, 0x001928ae, 0x001928ae, 0x003fc000}},
+ {{0x00197333, 0x00197333, 0x00197333, 0x003fc000}},
+ {{0x0019bdb8, 0x0019bdb8, 0x0019bdb8, 0x003fc000}},
+ {{0x001a083d, 0x001a083d, 0x001a083d, 0x003fc000}},
+ {{0x001a52c2, 0x001a52c2, 0x001a52c2, 0x003fc000}},
+ {{0x001a9d47, 0x001a9d47, 0x001a9d47, 0x003fc000}},
+ {{0x001ae7cc, 0x001ae7cc, 0x001ae7cc, 0x003fc000}},
+ {{0x001b3251, 0x001b3251, 0x001b3251, 0x003fc000}},
+ {{0x001b7cd6, 0x001b7cd6, 0x001b7cd6, 0x003fc000}},
+ {{0x001bc75b, 0x001bc75b, 0x001bc75b, 0x003fc000}},
+ {{0x001c11e0, 0x001c11e0, 0x001c11e0, 0x003fc000}},
+ {{0x001c5c65, 0x001c5c65, 0x001c5c65, 0x003fc000}},
+ {{0x001ca6ea, 0x001ca6ea, 0x001ca6ea, 0x003fc000}},
+ {{0x001cf16f, 0x001cf16f, 0x001cf16f, 0x003fc000}},
+ {{0x001d3bf4, 0x001d3bf4, 0x001d3bf4, 0x003fc000}},
+ {{0x001d8679, 0x001d8679, 0x001d8679, 0x003fc000}},
+ {{0x001dd0fe, 0x001dd0fe, 0x001dd0fe, 0x003fc000}},
+ {{0x001e1b83, 0x001e1b83, 0x001e1b83, 0x003fc000}},
+ {{0x001e6608, 0x001e6608, 0x001e6608, 0x003fc000}},
+ {{0x001eb08d, 0x001eb08d, 0x001eb08d, 0x003fc000}},
+ {{0x001efb12, 0x001efb12, 0x001efb12, 0x003fc000}},
+ {{0x001f4597, 0x001f4597, 0x001f4597, 0x003fc000}},
+ {{0x001f901c, 0x001f901c, 0x001f901c, 0x003fc000}},
+ {{0x001fdaa1, 0x001fdaa1, 0x001fdaa1, 0x003fc000}},
+ {{0x00202526, 0x00202526, 0x00202526, 0x003fc000}},
+ {{0x00206fab, 0x00206fab, 0x00206fab, 0x003fc000}},
+ {{0x0020ba30, 0x0020ba30, 0x0020ba30, 0x003fc000}},
+ {{0x002104b5, 0x002104b5, 0x002104b5, 0x003fc000}},
+ {{0x00214f3a, 0x00214f3a, 0x00214f3a, 0x003fc000}},
+ {{0x002199bf, 0x002199bf, 0x002199bf, 0x003fc000}},
+ {{0x0021e444, 0x0021e444, 0x0021e444, 0x003fc000}},
+ {{0x00222ec9, 0x00222ec9, 0x00222ec9, 0x003fc000}},
+ {{0x0022794e, 0x0022794e, 0x0022794e, 0x003fc000}},
+ {{0x0022c3d3, 0x0022c3d3, 0x0022c3d3, 0x003fc000}},
+ {{0x00230e58, 0x00230e58, 0x00230e58, 0x003fc000}},
+ {{0x002358dd, 0x002358dd, 0x002358dd, 0x003fc000}},
+ {{0x0023a362, 0x0023a362, 0x0023a362, 0x003fc000}},
+ {{0x0023ede7, 0x0023ede7, 0x0023ede7, 0x003fc000}},
+ {{0x0024386c, 0x0024386c, 0x0024386c, 0x003fc000}},
+ {{0x002482f1, 0x002482f1, 0x002482f1, 0x003fc000}},
+ {{0x0024cd76, 0x0024cd76, 0x0024cd76, 0x003fc000}},
+ {{0x002517fb, 0x002517fb, 0x002517fb, 0x003fc000}},
+ {{0x00256280, 0x00256280, 0x00256280, 0x003fc000}},
+ {{0x0025ad05, 0x0025ad05, 0x0025ad05, 0x003fc000}},
+ {{0x0025f78a, 0x0025f78a, 0x0025f78a, 0x003fc000}},
+ {{0x0026420f, 0x0026420f, 0x0026420f, 0x003fc000}},
+ {{0x00268c94, 0x00268c94, 0x00268c94, 0x003fc000}},
+ {{0x0026d719, 0x0026d719, 0x0026d719, 0x003fc000}},
+ {{0x0027219e, 0x0027219e, 0x0027219e, 0x003fc000}},
+ {{0x00276c23, 0x00276c23, 0x00276c23, 0x003fc000}},
+ {{0x0027b6a8, 0x0027b6a8, 0x0027b6a8, 0x003fc000}},
+ {{0x0028012d, 0x0028012d, 0x0028012d, 0x003fc000}},
+ {{0x00284bb2, 0x00284bb2, 0x00284bb2, 0x003fc000}},
+ {{0x00289637, 0x00289637, 0x00289637, 0x003fc000}},
+ {{0x0028e0bc, 0x0028e0bc, 0x0028e0bc, 0x003fc000}},
+ {{0x00292b41, 0x00292b41, 0x00292b41, 0x003fc000}},
+ {{0x002975c6, 0x002975c6, 0x002975c6, 0x003fc000}},
+ {{0x0029c04b, 0x0029c04b, 0x0029c04b, 0x003fc000}},
+ {{0x002a0ad0, 0x002a0ad0, 0x002a0ad0, 0x003fc000}},
+ {{0x002a5555, 0x002a5555, 0x002a5555, 0x003fc000}},
+ {{0x002a9fda, 0x002a9fda, 0x002a9fda, 0x003fc000}},
+ {{0x002aea5f, 0x002aea5f, 0x002aea5f, 0x003fc000}},
+ {{0x002b34e4, 0x002b34e4, 0x002b34e4, 0x003fc000}},
+ {{0x002b7f69, 0x002b7f69, 0x002b7f69, 0x003fc000}},
+ {{0x002bc9ee, 0x002bc9ee, 0x002bc9ee, 0x003fc000}},
+ {{0x002c1473, 0x002c1473, 0x002c1473, 0x003fc000}},
+ {{0x002c5ef8, 0x002c5ef8, 0x002c5ef8, 0x003fc000}},
+ {{0x002ca97d, 0x002ca97d, 0x002ca97d, 0x003fc000}},
+ {{0x002cf402, 0x002cf402, 0x002cf402, 0x003fc000}},
+ {{0x002d3e87, 0x002d3e87, 0x002d3e87, 0x003fc000}},
+ {{0x002d890c, 0x002d890c, 0x002d890c, 0x003fc000}},
+ {{0x002dd391, 0x002dd391, 0x002dd391, 0x003fc000}},
+ {{0x002e1e16, 0x002e1e16, 0x002e1e16, 0x003fc000}},
+ {{0x002e689b, 0x002e689b, 0x002e689b, 0x003fc000}},
+ {{0x002eb320, 0x002eb320, 0x002eb320, 0x003fc000}},
+ {{0x002efda5, 0x002efda5, 0x002efda5, 0x003fc000}},
+ {{0x002f482a, 0x002f482a, 0x002f482a, 0x003fc000}},
+ {{0x002f92af, 0x002f92af, 0x002f92af, 0x003fc000}},
+ {{0x002fdd34, 0x002fdd34, 0x002fdd34, 0x003fc000}},
+ {{0x003027b9, 0x003027b9, 0x003027b9, 0x003fc000}},
+ {{0x0030723e, 0x0030723e, 0x0030723e, 0x003fc000}},
+ {{0x0030bcc3, 0x0030bcc3, 0x0030bcc3, 0x003fc000}},
+ {{0x00310748, 0x00310748, 0x00310748, 0x003fc000}},
+ {{0x003151cd, 0x003151cd, 0x003151cd, 0x003fc000}},
+ {{0x00319c52, 0x00319c52, 0x00319c52, 0x003fc000}},
+ {{0x0031e6d7, 0x0031e6d7, 0x0031e6d7, 0x003fc000}},
+ {{0x0032315c, 0x0032315c, 0x0032315c, 0x003fc000}},
+ {{0x00327be1, 0x00327be1, 0x00327be1, 0x003fc000}},
+ {{0x0032c666, 0x0032c666, 0x0032c666, 0x003fc000}},
+ {{0x003310eb, 0x003310eb, 0x003310eb, 0x003fc000}},
+ {{0x00335b70, 0x00335b70, 0x00335b70, 0x003fc000}},
+ {{0x0033a5f5, 0x0033a5f5, 0x0033a5f5, 0x003fc000}},
+ {{0x0033f07a, 0x0033f07a, 0x0033f07a, 0x003fc000}},
+ {{0x00343aff, 0x00343aff, 0x00343aff, 0x003fc000}},
+ {{0x00348584, 0x00348584, 0x00348584, 0x003fc000}},
+ {{0x0034d009, 0x0034d009, 0x0034d009, 0x003fc000}},
+ {{0x00351a8e, 0x00351a8e, 0x00351a8e, 0x003fc000}},
+ {{0x00356513, 0x00356513, 0x00356513, 0x003fc000}},
+ {{0x0035af98, 0x0035af98, 0x0035af98, 0x003fc000}},
+ {{0x0035fa1d, 0x0035fa1d, 0x0035fa1d, 0x003fc000}},
+ {{0x003644a2, 0x003644a2, 0x003644a2, 0x003fc000}},
+ {{0x00368f27, 0x00368f27, 0x00368f27, 0x003fc000}},
+ {{0x0036d9ac, 0x0036d9ac, 0x0036d9ac, 0x003fc000}},
+ {{0x00372431, 0x00372431, 0x00372431, 0x003fc000}},
+ {{0x00376eb6, 0x00376eb6, 0x00376eb6, 0x003fc000}},
+ {{0x0037b93b, 0x0037b93b, 0x0037b93b, 0x003fc000}},
+ {{0x003803c0, 0x003803c0, 0x003803c0, 0x003fc000}},
+ {{0x00384e45, 0x00384e45, 0x00384e45, 0x003fc000}},
+ {{0x003898ca, 0x003898ca, 0x003898ca, 0x003fc000}},
+ {{0x0038e34f, 0x0038e34f, 0x0038e34f, 0x003fc000}},
+ {{0x00392dd4, 0x00392dd4, 0x00392dd4, 0x003fc000}},
+ {{0x00397859, 0x00397859, 0x00397859, 0x003fc000}},
+ {{0x0039c2de, 0x0039c2de, 0x0039c2de, 0x003fc000}},
+ {{0x003a0d63, 0x003a0d63, 0x003a0d63, 0x003fc000}},
+ {{0x003a57e8, 0x003a57e8, 0x003a57e8, 0x003fc000}},
+ {{0x003aa26d, 0x003aa26d, 0x003aa26d, 0x003fc000}},
+ {{0x003aecf2, 0x003aecf2, 0x003aecf2, 0x003fc000}},
+ {{0x003b3777, 0x003b3777, 0x003b3777, 0x003fc000}},
+ {{0x003b81fc, 0x003b81fc, 0x003b81fc, 0x003fc000}},
+ {{0x003bcc81, 0x003bcc81, 0x003bcc81, 0x003fc000}},
+ {{0x003c1706, 0x003c1706, 0x003c1706, 0x003fc000}},
+ {{0x003c618b, 0x003c618b, 0x003c618b, 0x003fc000}},
+ {{0x003cac10, 0x003cac10, 0x003cac10, 0x003fc000}},
+ {{0x003cf695, 0x003cf695, 0x003cf695, 0x003fc000}},
+ {{0x003d411a, 0x003d411a, 0x003d411a, 0x003fc000}},
+ {{0x003d8b9f, 0x003d8b9f, 0x003d8b9f, 0x003fc000}},
+ {{0x003dd624, 0x003dd624, 0x003dd624, 0x003fc000}},
+ {{0x003e20a9, 0x003e20a9, 0x003e20a9, 0x003fc000}},
+ {{0x003e6b2e, 0x003e6b2e, 0x003e6b2e, 0x003fc000}},
+ {{0x003eb5b3, 0x003eb5b3, 0x003eb5b3, 0x003fc000}},
+ {{0x003f0038, 0x003f0038, 0x003f0038, 0x003fc000}},
+ {{0x003f4abd, 0x003f4abd, 0x003f4abd, 0x003fc000}},
+ {{0x003f9542, 0x003f9542, 0x003f9542, 0x003fc000}},
+ {{0x003fdfc7, 0x003fdfc7, 0x003fdfc7, 0x003fc000}},
+ {{0x00402a4c, 0x00402a4c, 0x00402a4c, 0x003fc000}},
+ {{0x004074d1, 0x004074d1, 0x004074d1, 0x003fc000}},
+ {{0x0040bf56, 0x0040bf56, 0x0040bf56, 0x003fc000}},
+ {{0x004109db, 0x004109db, 0x004109db, 0x003fc000}},
+ {{0x00415460, 0x00415460, 0x00415460, 0x003fc000}},
+ {{0x00419ee5, 0x00419ee5, 0x00419ee5, 0x003fc000}},
+ {{0x0041e96a, 0x0041e96a, 0x0041e96a, 0x003fc000}},
+ {{0x004233ef, 0x004233ef, 0x004233ef, 0x003fc000}},
+ {{0x00427e74, 0x00427e74, 0x00427e74, 0x003fc000}},
+ {{0x0042c8f9, 0x0042c8f9, 0x0042c8f9, 0x003fc000}},
+ {{0x0043137e, 0x0043137e, 0x0043137e, 0x003fc000}},
+ {{0x00435e03, 0x00435e03, 0x00435e03, 0x003fc000}},
+ {{0x0043a888, 0x0043a888, 0x0043a888, 0x003fc000}},
+ {{0x0043f30d, 0x0043f30d, 0x0043f30d, 0x003fc000}},
+ {{0x00443d92, 0x00443d92, 0x00443d92, 0x003fc000}},
+ {{0x00448817, 0x00448817, 0x00448817, 0x003fc000}},
+ {{0x0044d29c, 0x0044d29c, 0x0044d29c, 0x003fc000}},
+ {{0x00451d21, 0x00451d21, 0x00451d21, 0x003fc000}},
+ {{0x004567a6, 0x004567a6, 0x004567a6, 0x003fc000}},
+ {{0x0045b22b, 0x0045b22b, 0x0045b22b, 0x003fc000}}
+};
+
+static const VP8kCstSSE2 VP8kUtoRGBA[256] = {
+ {{0, 0x000c8980, 0xffbf7300, 0}}, {{0, 0x000c706d, 0xffbff41a, 0}},
+ {{0, 0x000c575a, 0xffc07534, 0}}, {{0, 0x000c3e47, 0xffc0f64e, 0}},
+ {{0, 0x000c2534, 0xffc17768, 0}}, {{0, 0x000c0c21, 0xffc1f882, 0}},
+ {{0, 0x000bf30e, 0xffc2799c, 0}}, {{0, 0x000bd9fb, 0xffc2fab6, 0}},
+ {{0, 0x000bc0e8, 0xffc37bd0, 0}}, {{0, 0x000ba7d5, 0xffc3fcea, 0}},
+ {{0, 0x000b8ec2, 0xffc47e04, 0}}, {{0, 0x000b75af, 0xffc4ff1e, 0}},
+ {{0, 0x000b5c9c, 0xffc58038, 0}}, {{0, 0x000b4389, 0xffc60152, 0}},
+ {{0, 0x000b2a76, 0xffc6826c, 0}}, {{0, 0x000b1163, 0xffc70386, 0}},
+ {{0, 0x000af850, 0xffc784a0, 0}}, {{0, 0x000adf3d, 0xffc805ba, 0}},
+ {{0, 0x000ac62a, 0xffc886d4, 0}}, {{0, 0x000aad17, 0xffc907ee, 0}},
+ {{0, 0x000a9404, 0xffc98908, 0}}, {{0, 0x000a7af1, 0xffca0a22, 0}},
+ {{0, 0x000a61de, 0xffca8b3c, 0}}, {{0, 0x000a48cb, 0xffcb0c56, 0}},
+ {{0, 0x000a2fb8, 0xffcb8d70, 0}}, {{0, 0x000a16a5, 0xffcc0e8a, 0}},
+ {{0, 0x0009fd92, 0xffcc8fa4, 0}}, {{0, 0x0009e47f, 0xffcd10be, 0}},
+ {{0, 0x0009cb6c, 0xffcd91d8, 0}}, {{0, 0x0009b259, 0xffce12f2, 0}},
+ {{0, 0x00099946, 0xffce940c, 0}}, {{0, 0x00098033, 0xffcf1526, 0}},
+ {{0, 0x00096720, 0xffcf9640, 0}}, {{0, 0x00094e0d, 0xffd0175a, 0}},
+ {{0, 0x000934fa, 0xffd09874, 0}}, {{0, 0x00091be7, 0xffd1198e, 0}},
+ {{0, 0x000902d4, 0xffd19aa8, 0}}, {{0, 0x0008e9c1, 0xffd21bc2, 0}},
+ {{0, 0x0008d0ae, 0xffd29cdc, 0}}, {{0, 0x0008b79b, 0xffd31df6, 0}},
+ {{0, 0x00089e88, 0xffd39f10, 0}}, {{0, 0x00088575, 0xffd4202a, 0}},
+ {{0, 0x00086c62, 0xffd4a144, 0}}, {{0, 0x0008534f, 0xffd5225e, 0}},
+ {{0, 0x00083a3c, 0xffd5a378, 0}}, {{0, 0x00082129, 0xffd62492, 0}},
+ {{0, 0x00080816, 0xffd6a5ac, 0}}, {{0, 0x0007ef03, 0xffd726c6, 0}},
+ {{0, 0x0007d5f0, 0xffd7a7e0, 0}}, {{0, 0x0007bcdd, 0xffd828fa, 0}},
+ {{0, 0x0007a3ca, 0xffd8aa14, 0}}, {{0, 0x00078ab7, 0xffd92b2e, 0}},
+ {{0, 0x000771a4, 0xffd9ac48, 0}}, {{0, 0x00075891, 0xffda2d62, 0}},
+ {{0, 0x00073f7e, 0xffdaae7c, 0}}, {{0, 0x0007266b, 0xffdb2f96, 0}},
+ {{0, 0x00070d58, 0xffdbb0b0, 0}}, {{0, 0x0006f445, 0xffdc31ca, 0}},
+ {{0, 0x0006db32, 0xffdcb2e4, 0}}, {{0, 0x0006c21f, 0xffdd33fe, 0}},
+ {{0, 0x0006a90c, 0xffddb518, 0}}, {{0, 0x00068ff9, 0xffde3632, 0}},
+ {{0, 0x000676e6, 0xffdeb74c, 0}}, {{0, 0x00065dd3, 0xffdf3866, 0}},
+ {{0, 0x000644c0, 0xffdfb980, 0}}, {{0, 0x00062bad, 0xffe03a9a, 0}},
+ {{0, 0x0006129a, 0xffe0bbb4, 0}}, {{0, 0x0005f987, 0xffe13cce, 0}},
+ {{0, 0x0005e074, 0xffe1bde8, 0}}, {{0, 0x0005c761, 0xffe23f02, 0}},
+ {{0, 0x0005ae4e, 0xffe2c01c, 0}}, {{0, 0x0005953b, 0xffe34136, 0}},
+ {{0, 0x00057c28, 0xffe3c250, 0}}, {{0, 0x00056315, 0xffe4436a, 0}},
+ {{0, 0x00054a02, 0xffe4c484, 0}}, {{0, 0x000530ef, 0xffe5459e, 0}},
+ {{0, 0x000517dc, 0xffe5c6b8, 0}}, {{0, 0x0004fec9, 0xffe647d2, 0}},
+ {{0, 0x0004e5b6, 0xffe6c8ec, 0}}, {{0, 0x0004cca3, 0xffe74a06, 0}},
+ {{0, 0x0004b390, 0xffe7cb20, 0}}, {{0, 0x00049a7d, 0xffe84c3a, 0}},
+ {{0, 0x0004816a, 0xffe8cd54, 0}}, {{0, 0x00046857, 0xffe94e6e, 0}},
+ {{0, 0x00044f44, 0xffe9cf88, 0}}, {{0, 0x00043631, 0xffea50a2, 0}},
+ {{0, 0x00041d1e, 0xffead1bc, 0}}, {{0, 0x0004040b, 0xffeb52d6, 0}},
+ {{0, 0x0003eaf8, 0xffebd3f0, 0}}, {{0, 0x0003d1e5, 0xffec550a, 0}},
+ {{0, 0x0003b8d2, 0xffecd624, 0}}, {{0, 0x00039fbf, 0xffed573e, 0}},
+ {{0, 0x000386ac, 0xffedd858, 0}}, {{0, 0x00036d99, 0xffee5972, 0}},
+ {{0, 0x00035486, 0xffeeda8c, 0}}, {{0, 0x00033b73, 0xffef5ba6, 0}},
+ {{0, 0x00032260, 0xffefdcc0, 0}}, {{0, 0x0003094d, 0xfff05dda, 0}},
+ {{0, 0x0002f03a, 0xfff0def4, 0}}, {{0, 0x0002d727, 0xfff1600e, 0}},
+ {{0, 0x0002be14, 0xfff1e128, 0}}, {{0, 0x0002a501, 0xfff26242, 0}},
+ {{0, 0x00028bee, 0xfff2e35c, 0}}, {{0, 0x000272db, 0xfff36476, 0}},
+ {{0, 0x000259c8, 0xfff3e590, 0}}, {{0, 0x000240b5, 0xfff466aa, 0}},
+ {{0, 0x000227a2, 0xfff4e7c4, 0}}, {{0, 0x00020e8f, 0xfff568de, 0}},
+ {{0, 0x0001f57c, 0xfff5e9f8, 0}}, {{0, 0x0001dc69, 0xfff66b12, 0}},
+ {{0, 0x0001c356, 0xfff6ec2c, 0}}, {{0, 0x0001aa43, 0xfff76d46, 0}},
+ {{0, 0x00019130, 0xfff7ee60, 0}}, {{0, 0x0001781d, 0xfff86f7a, 0}},
+ {{0, 0x00015f0a, 0xfff8f094, 0}}, {{0, 0x000145f7, 0xfff971ae, 0}},
+ {{0, 0x00012ce4, 0xfff9f2c8, 0}}, {{0, 0x000113d1, 0xfffa73e2, 0}},
+ {{0, 0x0000fabe, 0xfffaf4fc, 0}}, {{0, 0x0000e1ab, 0xfffb7616, 0}},
+ {{0, 0x0000c898, 0xfffbf730, 0}}, {{0, 0x0000af85, 0xfffc784a, 0}},
+ {{0, 0x00009672, 0xfffcf964, 0}}, {{0, 0x00007d5f, 0xfffd7a7e, 0}},
+ {{0, 0x0000644c, 0xfffdfb98, 0}}, {{0, 0x00004b39, 0xfffe7cb2, 0}},
+ {{0, 0x00003226, 0xfffefdcc, 0}}, {{0, 0x00001913, 0xffff7ee6, 0}},
+ {{0, 0x00000000, 0x00000000, 0}}, {{0, 0xffffe6ed, 0x0000811a, 0}},
+ {{0, 0xffffcdda, 0x00010234, 0}}, {{0, 0xffffb4c7, 0x0001834e, 0}},
+ {{0, 0xffff9bb4, 0x00020468, 0}}, {{0, 0xffff82a1, 0x00028582, 0}},
+ {{0, 0xffff698e, 0x0003069c, 0}}, {{0, 0xffff507b, 0x000387b6, 0}},
+ {{0, 0xffff3768, 0x000408d0, 0}}, {{0, 0xffff1e55, 0x000489ea, 0}},
+ {{0, 0xffff0542, 0x00050b04, 0}}, {{0, 0xfffeec2f, 0x00058c1e, 0}},
+ {{0, 0xfffed31c, 0x00060d38, 0}}, {{0, 0xfffeba09, 0x00068e52, 0}},
+ {{0, 0xfffea0f6, 0x00070f6c, 0}}, {{0, 0xfffe87e3, 0x00079086, 0}},
+ {{0, 0xfffe6ed0, 0x000811a0, 0}}, {{0, 0xfffe55bd, 0x000892ba, 0}},
+ {{0, 0xfffe3caa, 0x000913d4, 0}}, {{0, 0xfffe2397, 0x000994ee, 0}},
+ {{0, 0xfffe0a84, 0x000a1608, 0}}, {{0, 0xfffdf171, 0x000a9722, 0}},
+ {{0, 0xfffdd85e, 0x000b183c, 0}}, {{0, 0xfffdbf4b, 0x000b9956, 0}},
+ {{0, 0xfffda638, 0x000c1a70, 0}}, {{0, 0xfffd8d25, 0x000c9b8a, 0}},
+ {{0, 0xfffd7412, 0x000d1ca4, 0}}, {{0, 0xfffd5aff, 0x000d9dbe, 0}},
+ {{0, 0xfffd41ec, 0x000e1ed8, 0}}, {{0, 0xfffd28d9, 0x000e9ff2, 0}},
+ {{0, 0xfffd0fc6, 0x000f210c, 0}}, {{0, 0xfffcf6b3, 0x000fa226, 0}},
+ {{0, 0xfffcdda0, 0x00102340, 0}}, {{0, 0xfffcc48d, 0x0010a45a, 0}},
+ {{0, 0xfffcab7a, 0x00112574, 0}}, {{0, 0xfffc9267, 0x0011a68e, 0}},
+ {{0, 0xfffc7954, 0x001227a8, 0}}, {{0, 0xfffc6041, 0x0012a8c2, 0}},
+ {{0, 0xfffc472e, 0x001329dc, 0}}, {{0, 0xfffc2e1b, 0x0013aaf6, 0}},
+ {{0, 0xfffc1508, 0x00142c10, 0}}, {{0, 0xfffbfbf5, 0x0014ad2a, 0}},
+ {{0, 0xfffbe2e2, 0x00152e44, 0}}, {{0, 0xfffbc9cf, 0x0015af5e, 0}},
+ {{0, 0xfffbb0bc, 0x00163078, 0}}, {{0, 0xfffb97a9, 0x0016b192, 0}},
+ {{0, 0xfffb7e96, 0x001732ac, 0}}, {{0, 0xfffb6583, 0x0017b3c6, 0}},
+ {{0, 0xfffb4c70, 0x001834e0, 0}}, {{0, 0xfffb335d, 0x0018b5fa, 0}},
+ {{0, 0xfffb1a4a, 0x00193714, 0}}, {{0, 0xfffb0137, 0x0019b82e, 0}},
+ {{0, 0xfffae824, 0x001a3948, 0}}, {{0, 0xfffacf11, 0x001aba62, 0}},
+ {{0, 0xfffab5fe, 0x001b3b7c, 0}}, {{0, 0xfffa9ceb, 0x001bbc96, 0}},
+ {{0, 0xfffa83d8, 0x001c3db0, 0}}, {{0, 0xfffa6ac5, 0x001cbeca, 0}},
+ {{0, 0xfffa51b2, 0x001d3fe4, 0}}, {{0, 0xfffa389f, 0x001dc0fe, 0}},
+ {{0, 0xfffa1f8c, 0x001e4218, 0}}, {{0, 0xfffa0679, 0x001ec332, 0}},
+ {{0, 0xfff9ed66, 0x001f444c, 0}}, {{0, 0xfff9d453, 0x001fc566, 0}},
+ {{0, 0xfff9bb40, 0x00204680, 0}}, {{0, 0xfff9a22d, 0x0020c79a, 0}},
+ {{0, 0xfff9891a, 0x002148b4, 0}}, {{0, 0xfff97007, 0x0021c9ce, 0}},
+ {{0, 0xfff956f4, 0x00224ae8, 0}}, {{0, 0xfff93de1, 0x0022cc02, 0}},
+ {{0, 0xfff924ce, 0x00234d1c, 0}}, {{0, 0xfff90bbb, 0x0023ce36, 0}},
+ {{0, 0xfff8f2a8, 0x00244f50, 0}}, {{0, 0xfff8d995, 0x0024d06a, 0}},
+ {{0, 0xfff8c082, 0x00255184, 0}}, {{0, 0xfff8a76f, 0x0025d29e, 0}},
+ {{0, 0xfff88e5c, 0x002653b8, 0}}, {{0, 0xfff87549, 0x0026d4d2, 0}},
+ {{0, 0xfff85c36, 0x002755ec, 0}}, {{0, 0xfff84323, 0x0027d706, 0}},
+ {{0, 0xfff82a10, 0x00285820, 0}}, {{0, 0xfff810fd, 0x0028d93a, 0}},
+ {{0, 0xfff7f7ea, 0x00295a54, 0}}, {{0, 0xfff7ded7, 0x0029db6e, 0}},
+ {{0, 0xfff7c5c4, 0x002a5c88, 0}}, {{0, 0xfff7acb1, 0x002adda2, 0}},
+ {{0, 0xfff7939e, 0x002b5ebc, 0}}, {{0, 0xfff77a8b, 0x002bdfd6, 0}},
+ {{0, 0xfff76178, 0x002c60f0, 0}}, {{0, 0xfff74865, 0x002ce20a, 0}},
+ {{0, 0xfff72f52, 0x002d6324, 0}}, {{0, 0xfff7163f, 0x002de43e, 0}},
+ {{0, 0xfff6fd2c, 0x002e6558, 0}}, {{0, 0xfff6e419, 0x002ee672, 0}},
+ {{0, 0xfff6cb06, 0x002f678c, 0}}, {{0, 0xfff6b1f3, 0x002fe8a6, 0}},
+ {{0, 0xfff698e0, 0x003069c0, 0}}, {{0, 0xfff67fcd, 0x0030eada, 0}},
+ {{0, 0xfff666ba, 0x00316bf4, 0}}, {{0, 0xfff64da7, 0x0031ed0e, 0}},
+ {{0, 0xfff63494, 0x00326e28, 0}}, {{0, 0xfff61b81, 0x0032ef42, 0}},
+ {{0, 0xfff6026e, 0x0033705c, 0}}, {{0, 0xfff5e95b, 0x0033f176, 0}},
+ {{0, 0xfff5d048, 0x00347290, 0}}, {{0, 0xfff5b735, 0x0034f3aa, 0}},
+ {{0, 0xfff59e22, 0x003574c4, 0}}, {{0, 0xfff5850f, 0x0035f5de, 0}},
+ {{0, 0xfff56bfc, 0x003676f8, 0}}, {{0, 0xfff552e9, 0x0036f812, 0}},
+ {{0, 0xfff539d6, 0x0037792c, 0}}, {{0, 0xfff520c3, 0x0037fa46, 0}},
+ {{0, 0xfff507b0, 0x00387b60, 0}}, {{0, 0xfff4ee9d, 0x0038fc7a, 0}},
+ {{0, 0xfff4d58a, 0x00397d94, 0}}, {{0, 0xfff4bc77, 0x0039feae, 0}},
+ {{0, 0xfff4a364, 0x003a7fc8, 0}}, {{0, 0xfff48a51, 0x003b00e2, 0}},
+ {{0, 0xfff4713e, 0x003b81fc, 0}}, {{0, 0xfff4582b, 0x003c0316, 0}},
+ {{0, 0xfff43f18, 0x003c8430, 0}}, {{0, 0xfff42605, 0x003d054a, 0}},
+ {{0, 0xfff40cf2, 0x003d8664, 0}}, {{0, 0xfff3f3df, 0x003e077e, 0}},
+ {{0, 0xfff3dacc, 0x003e8898, 0}}, {{0, 0xfff3c1b9, 0x003f09b2, 0}},
+ {{0, 0xfff3a8a6, 0x003f8acc, 0}}, {{0, 0xfff38f93, 0x00400be6, 0}}
+};
+
+static VP8kCstSSE2 VP8kVtoRGBA[256] = {
+ {{0xffcced80, 0x001a0400, 0, 0}}, {{0xffcd53a5, 0x0019cff8, 0, 0}},
+ {{0xffcdb9ca, 0x00199bf0, 0, 0}}, {{0xffce1fef, 0x001967e8, 0, 0}},
+ {{0xffce8614, 0x001933e0, 0, 0}}, {{0xffceec39, 0x0018ffd8, 0, 0}},
+ {{0xffcf525e, 0x0018cbd0, 0, 0}}, {{0xffcfb883, 0x001897c8, 0, 0}},
+ {{0xffd01ea8, 0x001863c0, 0, 0}}, {{0xffd084cd, 0x00182fb8, 0, 0}},
+ {{0xffd0eaf2, 0x0017fbb0, 0, 0}}, {{0xffd15117, 0x0017c7a8, 0, 0}},
+ {{0xffd1b73c, 0x001793a0, 0, 0}}, {{0xffd21d61, 0x00175f98, 0, 0}},
+ {{0xffd28386, 0x00172b90, 0, 0}}, {{0xffd2e9ab, 0x0016f788, 0, 0}},
+ {{0xffd34fd0, 0x0016c380, 0, 0}}, {{0xffd3b5f5, 0x00168f78, 0, 0}},
+ {{0xffd41c1a, 0x00165b70, 0, 0}}, {{0xffd4823f, 0x00162768, 0, 0}},
+ {{0xffd4e864, 0x0015f360, 0, 0}}, {{0xffd54e89, 0x0015bf58, 0, 0}},
+ {{0xffd5b4ae, 0x00158b50, 0, 0}}, {{0xffd61ad3, 0x00155748, 0, 0}},
+ {{0xffd680f8, 0x00152340, 0, 0}}, {{0xffd6e71d, 0x0014ef38, 0, 0}},
+ {{0xffd74d42, 0x0014bb30, 0, 0}}, {{0xffd7b367, 0x00148728, 0, 0}},
+ {{0xffd8198c, 0x00145320, 0, 0}}, {{0xffd87fb1, 0x00141f18, 0, 0}},
+ {{0xffd8e5d6, 0x0013eb10, 0, 0}}, {{0xffd94bfb, 0x0013b708, 0, 0}},
+ {{0xffd9b220, 0x00138300, 0, 0}}, {{0xffda1845, 0x00134ef8, 0, 0}},
+ {{0xffda7e6a, 0x00131af0, 0, 0}}, {{0xffdae48f, 0x0012e6e8, 0, 0}},
+ {{0xffdb4ab4, 0x0012b2e0, 0, 0}}, {{0xffdbb0d9, 0x00127ed8, 0, 0}},
+ {{0xffdc16fe, 0x00124ad0, 0, 0}}, {{0xffdc7d23, 0x001216c8, 0, 0}},
+ {{0xffdce348, 0x0011e2c0, 0, 0}}, {{0xffdd496d, 0x0011aeb8, 0, 0}},
+ {{0xffddaf92, 0x00117ab0, 0, 0}}, {{0xffde15b7, 0x001146a8, 0, 0}},
+ {{0xffde7bdc, 0x001112a0, 0, 0}}, {{0xffdee201, 0x0010de98, 0, 0}},
+ {{0xffdf4826, 0x0010aa90, 0, 0}}, {{0xffdfae4b, 0x00107688, 0, 0}},
+ {{0xffe01470, 0x00104280, 0, 0}}, {{0xffe07a95, 0x00100e78, 0, 0}},
+ {{0xffe0e0ba, 0x000fda70, 0, 0}}, {{0xffe146df, 0x000fa668, 0, 0}},
+ {{0xffe1ad04, 0x000f7260, 0, 0}}, {{0xffe21329, 0x000f3e58, 0, 0}},
+ {{0xffe2794e, 0x000f0a50, 0, 0}}, {{0xffe2df73, 0x000ed648, 0, 0}},
+ {{0xffe34598, 0x000ea240, 0, 0}}, {{0xffe3abbd, 0x000e6e38, 0, 0}},
+ {{0xffe411e2, 0x000e3a30, 0, 0}}, {{0xffe47807, 0x000e0628, 0, 0}},
+ {{0xffe4de2c, 0x000dd220, 0, 0}}, {{0xffe54451, 0x000d9e18, 0, 0}},
+ {{0xffe5aa76, 0x000d6a10, 0, 0}}, {{0xffe6109b, 0x000d3608, 0, 0}},
+ {{0xffe676c0, 0x000d0200, 0, 0}}, {{0xffe6dce5, 0x000ccdf8, 0, 0}},
+ {{0xffe7430a, 0x000c99f0, 0, 0}}, {{0xffe7a92f, 0x000c65e8, 0, 0}},
+ {{0xffe80f54, 0x000c31e0, 0, 0}}, {{0xffe87579, 0x000bfdd8, 0, 0}},
+ {{0xffe8db9e, 0x000bc9d0, 0, 0}}, {{0xffe941c3, 0x000b95c8, 0, 0}},
+ {{0xffe9a7e8, 0x000b61c0, 0, 0}}, {{0xffea0e0d, 0x000b2db8, 0, 0}},
+ {{0xffea7432, 0x000af9b0, 0, 0}}, {{0xffeada57, 0x000ac5a8, 0, 0}},
+ {{0xffeb407c, 0x000a91a0, 0, 0}}, {{0xffeba6a1, 0x000a5d98, 0, 0}},
+ {{0xffec0cc6, 0x000a2990, 0, 0}}, {{0xffec72eb, 0x0009f588, 0, 0}},
+ {{0xffecd910, 0x0009c180, 0, 0}}, {{0xffed3f35, 0x00098d78, 0, 0}},
+ {{0xffeda55a, 0x00095970, 0, 0}}, {{0xffee0b7f, 0x00092568, 0, 0}},
+ {{0xffee71a4, 0x0008f160, 0, 0}}, {{0xffeed7c9, 0x0008bd58, 0, 0}},
+ {{0xffef3dee, 0x00088950, 0, 0}}, {{0xffefa413, 0x00085548, 0, 0}},
+ {{0xfff00a38, 0x00082140, 0, 0}}, {{0xfff0705d, 0x0007ed38, 0, 0}},
+ {{0xfff0d682, 0x0007b930, 0, 0}}, {{0xfff13ca7, 0x00078528, 0, 0}},
+ {{0xfff1a2cc, 0x00075120, 0, 0}}, {{0xfff208f1, 0x00071d18, 0, 0}},
+ {{0xfff26f16, 0x0006e910, 0, 0}}, {{0xfff2d53b, 0x0006b508, 0, 0}},
+ {{0xfff33b60, 0x00068100, 0, 0}}, {{0xfff3a185, 0x00064cf8, 0, 0}},
+ {{0xfff407aa, 0x000618f0, 0, 0}}, {{0xfff46dcf, 0x0005e4e8, 0, 0}},
+ {{0xfff4d3f4, 0x0005b0e0, 0, 0}}, {{0xfff53a19, 0x00057cd8, 0, 0}},
+ {{0xfff5a03e, 0x000548d0, 0, 0}}, {{0xfff60663, 0x000514c8, 0, 0}},
+ {{0xfff66c88, 0x0004e0c0, 0, 0}}, {{0xfff6d2ad, 0x0004acb8, 0, 0}},
+ {{0xfff738d2, 0x000478b0, 0, 0}}, {{0xfff79ef7, 0x000444a8, 0, 0}},
+ {{0xfff8051c, 0x000410a0, 0, 0}}, {{0xfff86b41, 0x0003dc98, 0, 0}},
+ {{0xfff8d166, 0x0003a890, 0, 0}}, {{0xfff9378b, 0x00037488, 0, 0}},
+ {{0xfff99db0, 0x00034080, 0, 0}}, {{0xfffa03d5, 0x00030c78, 0, 0}},
+ {{0xfffa69fa, 0x0002d870, 0, 0}}, {{0xfffad01f, 0x0002a468, 0, 0}},
+ {{0xfffb3644, 0x00027060, 0, 0}}, {{0xfffb9c69, 0x00023c58, 0, 0}},
+ {{0xfffc028e, 0x00020850, 0, 0}}, {{0xfffc68b3, 0x0001d448, 0, 0}},
+ {{0xfffcced8, 0x0001a040, 0, 0}}, {{0xfffd34fd, 0x00016c38, 0, 0}},
+ {{0xfffd9b22, 0x00013830, 0, 0}}, {{0xfffe0147, 0x00010428, 0, 0}},
+ {{0xfffe676c, 0x0000d020, 0, 0}}, {{0xfffecd91, 0x00009c18, 0, 0}},
+ {{0xffff33b6, 0x00006810, 0, 0}}, {{0xffff99db, 0x00003408, 0, 0}},
+ {{0x00000000, 0x00000000, 0, 0}}, {{0x00006625, 0xffffcbf8, 0, 0}},
+ {{0x0000cc4a, 0xffff97f0, 0, 0}}, {{0x0001326f, 0xffff63e8, 0, 0}},
+ {{0x00019894, 0xffff2fe0, 0, 0}}, {{0x0001feb9, 0xfffefbd8, 0, 0}},
+ {{0x000264de, 0xfffec7d0, 0, 0}}, {{0x0002cb03, 0xfffe93c8, 0, 0}},
+ {{0x00033128, 0xfffe5fc0, 0, 0}}, {{0x0003974d, 0xfffe2bb8, 0, 0}},
+ {{0x0003fd72, 0xfffdf7b0, 0, 0}}, {{0x00046397, 0xfffdc3a8, 0, 0}},
+ {{0x0004c9bc, 0xfffd8fa0, 0, 0}}, {{0x00052fe1, 0xfffd5b98, 0, 0}},
+ {{0x00059606, 0xfffd2790, 0, 0}}, {{0x0005fc2b, 0xfffcf388, 0, 0}},
+ {{0x00066250, 0xfffcbf80, 0, 0}}, {{0x0006c875, 0xfffc8b78, 0, 0}},
+ {{0x00072e9a, 0xfffc5770, 0, 0}}, {{0x000794bf, 0xfffc2368, 0, 0}},
+ {{0x0007fae4, 0xfffbef60, 0, 0}}, {{0x00086109, 0xfffbbb58, 0, 0}},
+ {{0x0008c72e, 0xfffb8750, 0, 0}}, {{0x00092d53, 0xfffb5348, 0, 0}},
+ {{0x00099378, 0xfffb1f40, 0, 0}}, {{0x0009f99d, 0xfffaeb38, 0, 0}},
+ {{0x000a5fc2, 0xfffab730, 0, 0}}, {{0x000ac5e7, 0xfffa8328, 0, 0}},
+ {{0x000b2c0c, 0xfffa4f20, 0, 0}}, {{0x000b9231, 0xfffa1b18, 0, 0}},
+ {{0x000bf856, 0xfff9e710, 0, 0}}, {{0x000c5e7b, 0xfff9b308, 0, 0}},
+ {{0x000cc4a0, 0xfff97f00, 0, 0}}, {{0x000d2ac5, 0xfff94af8, 0, 0}},
+ {{0x000d90ea, 0xfff916f0, 0, 0}}, {{0x000df70f, 0xfff8e2e8, 0, 0}},
+ {{0x000e5d34, 0xfff8aee0, 0, 0}}, {{0x000ec359, 0xfff87ad8, 0, 0}},
+ {{0x000f297e, 0xfff846d0, 0, 0}}, {{0x000f8fa3, 0xfff812c8, 0, 0}},
+ {{0x000ff5c8, 0xfff7dec0, 0, 0}}, {{0x00105bed, 0xfff7aab8, 0, 0}},
+ {{0x0010c212, 0xfff776b0, 0, 0}}, {{0x00112837, 0xfff742a8, 0, 0}},
+ {{0x00118e5c, 0xfff70ea0, 0, 0}}, {{0x0011f481, 0xfff6da98, 0, 0}},
+ {{0x00125aa6, 0xfff6a690, 0, 0}}, {{0x0012c0cb, 0xfff67288, 0, 0}},
+ {{0x001326f0, 0xfff63e80, 0, 0}}, {{0x00138d15, 0xfff60a78, 0, 0}},
+ {{0x0013f33a, 0xfff5d670, 0, 0}}, {{0x0014595f, 0xfff5a268, 0, 0}},
+ {{0x0014bf84, 0xfff56e60, 0, 0}}, {{0x001525a9, 0xfff53a58, 0, 0}},
+ {{0x00158bce, 0xfff50650, 0, 0}}, {{0x0015f1f3, 0xfff4d248, 0, 0}},
+ {{0x00165818, 0xfff49e40, 0, 0}}, {{0x0016be3d, 0xfff46a38, 0, 0}},
+ {{0x00172462, 0xfff43630, 0, 0}}, {{0x00178a87, 0xfff40228, 0, 0}},
+ {{0x0017f0ac, 0xfff3ce20, 0, 0}}, {{0x001856d1, 0xfff39a18, 0, 0}},
+ {{0x0018bcf6, 0xfff36610, 0, 0}}, {{0x0019231b, 0xfff33208, 0, 0}},
+ {{0x00198940, 0xfff2fe00, 0, 0}}, {{0x0019ef65, 0xfff2c9f8, 0, 0}},
+ {{0x001a558a, 0xfff295f0, 0, 0}}, {{0x001abbaf, 0xfff261e8, 0, 0}},
+ {{0x001b21d4, 0xfff22de0, 0, 0}}, {{0x001b87f9, 0xfff1f9d8, 0, 0}},
+ {{0x001bee1e, 0xfff1c5d0, 0, 0}}, {{0x001c5443, 0xfff191c8, 0, 0}},
+ {{0x001cba68, 0xfff15dc0, 0, 0}}, {{0x001d208d, 0xfff129b8, 0, 0}},
+ {{0x001d86b2, 0xfff0f5b0, 0, 0}}, {{0x001decd7, 0xfff0c1a8, 0, 0}},
+ {{0x001e52fc, 0xfff08da0, 0, 0}}, {{0x001eb921, 0xfff05998, 0, 0}},
+ {{0x001f1f46, 0xfff02590, 0, 0}}, {{0x001f856b, 0xffeff188, 0, 0}},
+ {{0x001feb90, 0xffefbd80, 0, 0}}, {{0x002051b5, 0xffef8978, 0, 0}},
+ {{0x0020b7da, 0xffef5570, 0, 0}}, {{0x00211dff, 0xffef2168, 0, 0}},
+ {{0x00218424, 0xffeeed60, 0, 0}}, {{0x0021ea49, 0xffeeb958, 0, 0}},
+ {{0x0022506e, 0xffee8550, 0, 0}}, {{0x0022b693, 0xffee5148, 0, 0}},
+ {{0x00231cb8, 0xffee1d40, 0, 0}}, {{0x002382dd, 0xffede938, 0, 0}},
+ {{0x0023e902, 0xffedb530, 0, 0}}, {{0x00244f27, 0xffed8128, 0, 0}},
+ {{0x0024b54c, 0xffed4d20, 0, 0}}, {{0x00251b71, 0xffed1918, 0, 0}},
+ {{0x00258196, 0xffece510, 0, 0}}, {{0x0025e7bb, 0xffecb108, 0, 0}},
+ {{0x00264de0, 0xffec7d00, 0, 0}}, {{0x0026b405, 0xffec48f8, 0, 0}},
+ {{0x00271a2a, 0xffec14f0, 0, 0}}, {{0x0027804f, 0xffebe0e8, 0, 0}},
+ {{0x0027e674, 0xffebace0, 0, 0}}, {{0x00284c99, 0xffeb78d8, 0, 0}},
+ {{0x0028b2be, 0xffeb44d0, 0, 0}}, {{0x002918e3, 0xffeb10c8, 0, 0}},
+ {{0x00297f08, 0xffeadcc0, 0, 0}}, {{0x0029e52d, 0xffeaa8b8, 0, 0}},
+ {{0x002a4b52, 0xffea74b0, 0, 0}}, {{0x002ab177, 0xffea40a8, 0, 0}},
+ {{0x002b179c, 0xffea0ca0, 0, 0}}, {{0x002b7dc1, 0xffe9d898, 0, 0}},
+ {{0x002be3e6, 0xffe9a490, 0, 0}}, {{0x002c4a0b, 0xffe97088, 0, 0}},
+ {{0x002cb030, 0xffe93c80, 0, 0}}, {{0x002d1655, 0xffe90878, 0, 0}},
+ {{0x002d7c7a, 0xffe8d470, 0, 0}}, {{0x002de29f, 0xffe8a068, 0, 0}},
+ {{0x002e48c4, 0xffe86c60, 0, 0}}, {{0x002eaee9, 0xffe83858, 0, 0}},
+ {{0x002f150e, 0xffe80450, 0, 0}}, {{0x002f7b33, 0xffe7d048, 0, 0}},
+ {{0x002fe158, 0xffe79c40, 0, 0}}, {{0x0030477d, 0xffe76838, 0, 0}},
+ {{0x0030ada2, 0xffe73430, 0, 0}}, {{0x003113c7, 0xffe70028, 0, 0}},
+ {{0x003179ec, 0xffe6cc20, 0, 0}}, {{0x0031e011, 0xffe69818, 0, 0}},
+ {{0x00324636, 0xffe66410, 0, 0}}, {{0x0032ac5b, 0xffe63008, 0, 0}}
+};
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha.c
index 21d4b5c..79cb94d 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha.c
@@ -17,6 +17,7 @@
#include "./vp8enci.h"
#include "../utils/filters.h"
#include "../utils/quant_levels.h"
+#include "../utils/utils.h"
#include "../webp/format_constants.h"
// -----------------------------------------------------------------------------
@@ -34,7 +35,7 @@
//
// 'output' corresponds to the buffer containing compressed alpha data.
// This buffer is allocated by this method and caller should call
-// free(*output) when done.
+// WebPSafeFree(*output) when done.
// 'output_size' corresponds to size of this compressed alpha buffer.
//
// Returns 1 on successfully encoding the alpha and
@@ -46,12 +47,11 @@
static int EncodeLossless(const uint8_t* const data, int width, int height,
int effort_level, // in [0..6] range
- VP8BitWriter* const bw,
+ VP8LBitWriter* const bw,
WebPAuxStats* const stats) {
int ok = 0;
WebPConfig config;
WebPPicture picture;
- VP8LBitWriter tmp_bw;
WebPPictureInit(&picture);
picture.width = width;
@@ -83,16 +83,15 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
config.quality = 8.f * effort_level;
assert(config.quality >= 0 && config.quality <= 100.f);
- ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
- ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+ ok = (VP8LEncodeStream(&config, &picture, bw) == VP8_ENC_OK);
WebPPictureFree(&picture);
- if (ok) {
- const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
- const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
- VP8BitWriterAppend(bw, buffer, buffer_size);
+ ok = ok && !bw->error_;
+ if (!ok) {
+ VP8LBitWriterDestroy(bw);
+ return 0;
}
- VP8LBitWriterDestroy(&tmp_bw);
- return ok && !bw->error_;
+ return 1;
}
// -----------------------------------------------------------------------------
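Note: EncodeLossless now writes directly into a caller-owned VP8LBitWriter and reports failure through the writer's sticky error_ flag, instead of round-tripping through a temporary writer. A minimal sketch of that error-latching pattern (names are illustrative, not the libwebp API):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
      uint8_t* buf;
      size_t size, max;
      int error_;   /* sticky error flag; the caller checks it once at the end */
    } BitWriter;

    static int Append(BitWriter* const bw, const uint8_t* data, size_t n) {
      if (bw->error_) return 0;              /* already failed: stay failed */
      if (bw->size + n > bw->max) {
        const size_t new_max = 2 * bw->max + n;
        uint8_t* const p = (uint8_t*)realloc(bw->buf, new_max);
        if (p == NULL) {
          bw->error_ = 1;                    /* latch the failure */
          return 0;
        }
        bw->buf = p;
        bw->max = new_max;
      }
      memcpy(bw->buf + bw->size, data, n);
      bw->size += n;
      return 1;
    }
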
@@ -114,8 +113,10 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
const uint8_t* alpha_src;
WebPFilterFunc filter_func;
uint8_t header;
- size_t expected_size;
const size_t data_size = width * height;
+ const uint8_t* output = NULL;
+ size_t output_size = 0;
+ VP8LBitWriter tmp_bw;
assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
assert(filter >= 0 && filter < WEBP_FILTER_LAST);
@@ -124,15 +125,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
assert(sizeof(header) == ALPHA_HEADER_LEN);
// TODO(skal): have a common function and #define's to validate alpha params.
- expected_size =
- (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
- : (data_size >> 5);
- header = method | (filter << 2);
- if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
-
- VP8BitWriterInit(&result->bw, expected_size);
- VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
-
filter_func = WebPFilters[filter];
if (filter_func != NULL) {
filter_func(data, width, height, width, tmp_alpha);
@@ -141,14 +133,42 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
alpha_src = data;
}
+ if (method != ALPHA_NO_COMPRESSION) {
+ ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+ ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+ &tmp_bw, &result->stats);
+ if (ok) {
+ output = VP8LBitWriterFinish(&tmp_bw);
+ output_size = VP8LBitWriterNumBytes(&tmp_bw);
+ if (output_size > data_size) {
+ // compressed size is larger than source! Revert to uncompressed mode.
+ method = ALPHA_NO_COMPRESSION;
+ VP8LBitWriterDestroy(&tmp_bw);
+ }
+ } else {
+ VP8LBitWriterDestroy(&tmp_bw);
+ return 0;
+ }
+ }
+
if (method == ALPHA_NO_COMPRESSION) {
- ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
- ok = ok && !result->bw.error_;
- } else {
- ok = EncodeLossless(alpha_src, width, height, effort_level,
- &result->bw, &result->stats);
- VP8BitWriterFinish(&result->bw);
+ output = alpha_src;
+ output_size = data_size;
+ ok = 1;
+ }
+
+ // Emit final result.
+ header = method | (filter << 2);
+ if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+ VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+ ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+ ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+ if (method != ALPHA_NO_COMPRESSION) {
+ VP8LBitWriterDestroy(&tmp_bw);
}
+ ok = ok && !result->bw.error_;
result->score = VP8BitWriterSize(&result->bw);
return ok;
}
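The reworked EncodeAlphaInternal tries the lossless backend first and reverts to storing the alpha plane uncompressed whenever the compressed payload would be larger than the source. The shape of that fallback, sketched with hypothetical Compress()/FreeCompressed() stand-ins:

    #include <stddef.h>
    #include <stdint.h>

    enum { NO_COMPRESSION = 0, LOSSLESS = 1 };

    /* Hypothetical stand-ins for the VP8L calls used by EncodeAlphaInternal: */
    extern const uint8_t* Compress(const uint8_t* data, size_t size,
                                   size_t* compressed_size);
    extern void FreeCompressed(const uint8_t* buf);

    /* Try lossless first; fall back to raw storage if compression expands. */
    static int PickPayload(const uint8_t* data, size_t data_size, int* method,
                           const uint8_t** out, size_t* out_size) {
      if (*method != NO_COMPRESSION) {
        size_t packed_size = 0;
        const uint8_t* const packed = Compress(data, data_size, &packed_size);
        if (packed == NULL) return 0;        /* hard error: abort */
        if (packed_size <= data_size) {      /* compression pays off */
          *out = packed;
          *out_size = packed_size;
          return 1;
        }
        FreeCompressed(packed);              /* larger than source: revert */
        *method = NO_COMPRESSION;
      }
      *out = data;                           /* store alpha plane verbatim */
      *out_size = data_size;
      return 1;
    }
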
@@ -231,7 +251,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
GetFilterMap(alpha, width, height, filter, effort_level);
InitFilterTrial(&best);
if (try_map != FILTER_TRY_NONE) {
- uint8_t* filtered_alpha = (uint8_t*)malloc(data_size);
+ uint8_t* filtered_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
if (filtered_alpha == NULL) return 0;
for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
@@ -248,7 +268,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
}
}
}
- free(filtered_alpha);
+ WebPSafeFree(filtered_alpha);
} else {
ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
reduce_levels, effort_level, NULL, &best);
@@ -298,7 +318,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
filter = WEBP_FILTER_NONE;
}
- quant_alpha = (uint8_t*)malloc(data_size);
+ quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
if (quant_alpha == NULL) {
return 0;
}
@@ -325,7 +345,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
}
}
- free(quant_alpha);
+ WebPSafeFree(quant_alpha);
return ok;
}
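Throughout this patch, plain malloc()/free() calls become WebPSafeMalloc()/WebPSafeFree(). The safe variant takes a separate (nmemb, size) pair so the product can be overflow-checked in one place; a sketch of such a check (not the actual libwebp implementation):

    #include <stdint.h>
    #include <stdlib.h>

    static void* SafeMalloc(uint64_t nmemb, size_t size) {
      const uint64_t total = nmemb * size;
      if (nmemb == 0 || size == 0) return NULL;        /* nothing to allocate */
      if (total / nmemb != size) return NULL;          /* 64-bit wraparound */
      if (total > (uint64_t)~(size_t)0) return NULL;   /* doesn't fit size_t */
      return malloc((size_t)total);
    }
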
@@ -346,7 +366,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
return 0;
}
if (alpha_size != (uint32_t)alpha_size) { // Sanity check.
- free(alpha_data);
+ WebPSafeFree(alpha_data);
return 0;
}
enc->alpha_data_size_ = (uint32_t)alpha_size;
@@ -361,7 +381,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
enc->alpha_data_size_ = 0;
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
- WebPWorkerInit(worker);
+ WebPGetWorkerInterface()->Init(worker);
worker->data1 = enc;
worker->data2 = NULL;
worker->hook = (WebPWorkerHook)CompressAlphaJob;
@@ -372,10 +392,11 @@ int VP8EncStartAlpha(VP8Encoder* const enc) {
if (enc->has_alpha_) {
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
- if (!WebPWorkerReset(worker)) { // Makes sure worker is good to go.
+ // Makes sure worker is good to go.
+ if (!WebPGetWorkerInterface()->Reset(worker)) {
return 0;
}
- WebPWorkerLaunch(worker);
+ WebPGetWorkerInterface()->Launch(worker);
return 1;
} else {
return CompressAlphaJob(enc, NULL); // just do the job right away
@@ -388,7 +409,7 @@ int VP8EncFinishAlpha(VP8Encoder* const enc) {
if (enc->has_alpha_) {
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
- if (!WebPWorkerSync(worker)) return 0; // error
+ if (!WebPGetWorkerInterface()->Sync(worker)) return 0; // error
}
}
return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
@@ -398,10 +419,12 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
int ok = 1;
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
- ok = WebPWorkerSync(worker); // finish anything left in flight
- WebPWorkerEnd(worker); // still need to end the worker, even if !ok
+ // finish anything left in flight
+ ok = WebPGetWorkerInterface()->Sync(worker);
+ // still need to end the worker, even if !ok
+ WebPGetWorkerInterface()->End(worker);
}
- free(enc->alpha_data_);
+ WebPSafeFree(enc->alpha_data_);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
enc->has_alpha_ = 0;
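All direct WebPWorker* calls are now routed through WebPGetWorkerInterface(), a table of function pointers an embedder can replace (for instance on platforms without threads). A reduced sketch of such an interface with a purely synchronous fallback, using simplified names:

    typedef struct Worker Worker;
    struct Worker {
      void (*hook)(void* data);   /* the job to run */
      void* data;
      int ok;
    };

    typedef struct {
      void (*Init)(Worker* w);
      int  (*Reset)(Worker* w);   /* make worker ready; may spawn a thread */
      void (*Launch)(Worker* w);  /* run hook, possibly asynchronously */
      int  (*Sync)(Worker* w);    /* wait for completion, return status */
      void (*End)(Worker* w);     /* release resources */
    } WorkerInterface;

    /* Synchronous fallback: Launch simply runs the job in-place. */
    static void SInit(Worker* w) { w->ok = 1; }
    static int  SReset(Worker* w) { (void)w; return 1; }
    static void SLaunch(Worker* w) { w->hook(w->data); }
    static int  SSync(Worker* w) { return w->ok; }
    static void SEnd(Worker* w) { (void)w; }

    static const WorkerInterface kSerial = {
      SInit, SReset, SLaunch, SSync, SEnd
    };
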
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis.c
index 7d4cfdc..e019465 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis.c
@@ -30,7 +30,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
const int w = enc->mb_w_;
const int h = enc->mb_h_;
const int majority_cnt_3_x_3_grid = 5;
- uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+ uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
assert((uint64_t)(w * h) == (uint64_t)w * h); // no overflow, as per spec
if (tmp == NULL) return;
@@ -63,7 +63,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
mb->segment_ = tmp[x + y * w];
}
}
- free(tmp);
+ WebPSafeFree(tmp);
}
//------------------------------------------------------------------------------
@@ -141,7 +141,11 @@ static void MergeHistograms(const VP8Histogram* const in,
static void AssignSegments(VP8Encoder* const enc,
const int alphas[MAX_ALPHA + 1]) {
- const int nb = enc->segment_hdr_.num_segments_;
+ // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+ // explicit check is needed to avoid spurious warning about 'n + 1' exceeding
+ // array bounds of 'centers' with some compilers (noticed with gcc-4.9).
+ const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+ enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
int centers[NUM_MB_SEGMENTS];
int weighted_average = 0;
int map[MAX_ALPHA + 1];
@@ -151,6 +155,7 @@ static void AssignSegments(VP8Encoder* const enc,
int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
assert(nb >= 1);
+ assert(nb <= NUM_MB_SEGMENTS);
// bracket the input
for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
@@ -225,18 +230,15 @@ static void AssignSegments(VP8Encoder* const enc,
// susceptibility and set best modes for this macroblock.
// Segment assignment is done later.
-// Number of modes to inspect for alpha_ evaluation. For high-quality settings
-// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
-// during the analysis phase.
-#define FAST_ANALYSIS_METHOD 4 // method above which we do partial analysis
+// Number of modes to inspect for alpha_ evaluation. We don't need to test all
+// the possible modes during the analysis phase: we risk falling into a local
+// optimum, or being subject to boundary effects.
#define MAX_INTRA16_MODE 2
#define MAX_INTRA4_MODE 2
#define MAX_UV_MODE 2
static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
- const int max_mode =
- (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
- : NUM_PRED_MODES;
+ const int max_mode = MAX_INTRA16_MODE;
int mode;
int best_alpha = DEFAULT_ALPHA;
int best_mode = 0;
@@ -262,9 +264,7 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
int best_alpha) {
uint8_t modes[16];
- const int max_mode =
- (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
- : NUM_BMODES;
+ const int max_mode = MAX_INTRA4_MODE;
int i4_alpha;
VP8Histogram total_histo = { { 0 } };
int cur_histo = 0;
@@ -306,10 +306,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
int best_alpha = DEFAULT_ALPHA;
int best_mode = 0;
- const int max_mode =
- (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
- : NUM_PRED_MODES;
+ const int max_mode = MAX_UV_MODE;
int mode;
+
VP8MakeChroma8Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
VP8Histogram histo = { { 0 } };
@@ -425,7 +424,7 @@ static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
// initialize the job struct with some TODOs
static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
int start_row, int end_row) {
- WebPWorkerInit(&job->worker);
+ WebPGetWorkerInterface()->Init(&job->worker);
job->worker.data1 = job;
job->worker.data2 = &job->it;
job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
@@ -458,6 +457,8 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
#else
const int do_mt = 0;
#endif
+ const WebPWorkerInterface* const worker_interface =
+ WebPGetWorkerInterface();
SegmentJob main_job;
if (do_mt) {
SegmentJob side_job;
@@ -467,23 +468,23 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
InitSegmentJob(enc, &side_job, split_row, last_row);
// we don't need to call Reset() on main_job.worker, since we're calling
// WebPWorkerExecute() on it
- ok &= WebPWorkerReset(&side_job.worker);
+ ok &= worker_interface->Reset(&side_job.worker);
// launch the two jobs in parallel
if (ok) {
- WebPWorkerLaunch(&side_job.worker);
- WebPWorkerExecute(&main_job.worker);
- ok &= WebPWorkerSync(&side_job.worker);
- ok &= WebPWorkerSync(&main_job.worker);
+ worker_interface->Launch(&side_job.worker);
+ worker_interface->Execute(&main_job.worker);
+ ok &= worker_interface->Sync(&side_job.worker);
+ ok &= worker_interface->Sync(&main_job.worker);
}
- WebPWorkerEnd(&side_job.worker);
+ worker_interface->End(&side_job.worker);
if (ok) MergeJobs(&side_job, &main_job); // merge results together
} else {
// Even for single-thread case, we use the generic Worker tools.
InitSegmentJob(enc, &main_job, 0, last_row);
- WebPWorkerExecute(&main_job.worker);
- ok &= WebPWorkerSync(&main_job.worker);
+ worker_interface->Execute(&main_job.worker);
+ ok &= worker_interface->Sync(&main_job.worker);
}
- WebPWorkerEnd(&main_job.worker);
+ worker_interface->End(&main_job.worker);
if (ok) {
enc->alpha_ = main_job.alpha / total_mb;
enc->uv_alpha_ = main_job.uv_alpha / total_mb;
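VP8EncAnalyze splits the macroblock rows in two, runs one half on a side worker while the current thread processes the other, then merges the results. The same fork/join shape in plain pthreads (a sketch, not the libwebp worker code):

    #include <pthread.h>

    typedef struct { int start_row, end_row; int alpha; } Job;

    static void* RunJob(void* arg) {
      Job* const job = (Job*)arg;
      /* ... analyze rows [start_row, end_row) and accumulate job->alpha ... */
      return NULL;
    }

    static int AnalyzeParallel(int num_rows, Job* main_job, Job* side_job) {
      pthread_t tid;
      const int split = num_rows / 2;
      main_job->start_row = 0;      main_job->end_row = split;
      side_job->start_row = split;  side_job->end_row = num_rows;
      if (pthread_create(&tid, NULL, RunJob, side_job) != 0) return 0;
      RunJob(main_job);                     /* this thread does its half */
      pthread_join(tid, NULL);              /* wait for the side job */
      main_job->alpha += side_job->alpha;   /* merge results together */
      return 1;
    }
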
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references.c
index 77b4be7..a3c30aa 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references.c
@@ -12,7 +12,6 @@
#include <assert.h>
#include <math.h>
-#include <stdio.h>
#include "./backward_references.h"
#include "./histogram.h"
@@ -22,10 +21,12 @@
#define VALUES_IN_BYTE 256
-#define HASH_BITS 18
-#define HASH_SIZE (1 << HASH_BITS)
#define HASH_MULTIPLIER (0xc6a4a7935bd1e995ULL)
+#define MIN_BLOCK_SIZE 256 // minimum block size for backward references
+
+#define MAX_ENTROPY (1e30f)
+
// 1M window (4M bytes) minus 120 special codes for short distances.
#define WINDOW_SIZE ((1 << 20) - 120)
@@ -33,14 +34,6 @@
#define MIN_LENGTH 2
#define MAX_LENGTH 4096
-typedef struct {
- // Stores the most recently added position with the given hash value.
- int32_t hash_to_first_index_[HASH_SIZE];
- // chain_[pos] stores the previous position with the same hash value
- // for every pixel in the image.
- int32_t* chain_;
-} HashChain;
-
// -----------------------------------------------------------------------------
static const uint8_t plane_to_code_lut[128] = {
@@ -78,65 +71,152 @@ static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
// -----------------------------------------------------------------------------
// VP8LBackwardRefs
-void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs) {
- if (refs != NULL) {
- refs->refs = NULL;
- refs->size = 0;
- refs->max_size = 0;
+struct PixOrCopyBlock {
+ PixOrCopyBlock* next_; // next block (or NULL)
+ PixOrCopy* start_; // data start
+ int size_; // currently used size
+};
+
+static void ClearBackwardRefs(VP8LBackwardRefs* const refs) {
+ assert(refs != NULL);
+ if (refs->tail_ != NULL) {
+ *refs->tail_ = refs->free_blocks_; // recycle all blocks at once
}
+ refs->free_blocks_ = refs->refs_;
+ refs->tail_ = &refs->refs_;
+ refs->last_block_ = NULL;
+ refs->refs_ = NULL;
}
-void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
- if (refs != NULL) {
- free(refs->refs);
- VP8LInitBackwardRefs(refs);
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+ assert(refs != NULL);
+ ClearBackwardRefs(refs);
+ while (refs->free_blocks_ != NULL) {
+ PixOrCopyBlock* const next = refs->free_blocks_->next_;
+ WebPSafeFree(refs->free_blocks_);
+ refs->free_blocks_ = next;
}
}
-int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size) {
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
assert(refs != NULL);
- refs->size = 0;
- refs->max_size = 0;
- refs->refs = (PixOrCopy*)WebPSafeMalloc((uint64_t)max_size,
- sizeof(*refs->refs));
- if (refs->refs == NULL) return 0;
- refs->max_size = max_size;
+ memset(refs, 0, sizeof(*refs));
+ refs->tail_ = &refs->refs_;
+ refs->block_size_ =
+ (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+ VP8LRefsCursor c;
+ c.cur_block_ = refs->refs_;
+ if (refs->refs_ != NULL) {
+ c.cur_pos = c.cur_block_->start_;
+ c.last_pos_ = c.cur_pos + c.cur_block_->size_;
+ } else {
+ c.cur_pos = NULL;
+ c.last_pos_ = NULL;
+ }
+ return c;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
+ PixOrCopyBlock* const b = c->cur_block_->next_;
+ c->cur_pos = (b == NULL) ? NULL : b->start_;
+ c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
+ c->cur_block_ = b;
+}
+
+// Create a new block, either recycled from the free-list or newly allocated.
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+ PixOrCopyBlock* b = refs->free_blocks_;
+ if (b == NULL) { // allocate new memory chunk
+ const size_t total_size =
+ sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
+ b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
+ if (b == NULL) {
+ refs->error_ |= 1;
+ return NULL;
+ }
+ b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b)); // not always aligned
+ } else { // recycle from free-list
+ refs->free_blocks_ = b->next_;
+ }
+ *refs->tail_ = b;
+ refs->tail_ = &b->next_;
+ refs->last_block_ = b;
+ b->next_ = NULL;
+ b->size_ = 0;
+ return b;
+}
+
+static WEBP_INLINE void BackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v) {
+ PixOrCopyBlock* b = refs->last_block_;
+ if (b == NULL || b->size_ == refs->block_size_) {
+ b = BackwardRefsNewBlock(refs);
+ if (b == NULL) return; // refs->error_ is set
+ }
+ b->start_[b->size_++] = v;
+}
+
+int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
+ VP8LBackwardRefs* const dst) {
+ const PixOrCopyBlock* b = src->refs_;
+ ClearBackwardRefs(dst);
+ assert(src->block_size_ == dst->block_size_);
+ while (b != NULL) {
+ PixOrCopyBlock* const new_b = BackwardRefsNewBlock(dst);
+ if (new_b == NULL) return 0; // dst->error_ is set
+ memcpy(new_b->start_, b->start_, b->size_ * sizeof(*b->start_));
+ new_b->size_ = b->size_;
+ b = b->next_;
+ }
return 1;
}
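VP8LBackwardRefs no longer holds one large PixOrCopy array: references live in a linked list of fixed-size blocks, and ClearBackwardRefs() recycles the entire list onto a free-list in O(1) by splicing at the cached tail pointer. A reduced sketch of that pooling scheme:

    #include <stdlib.h>

    typedef struct Block Block;
    struct Block { Block* next; int size; /* payload follows */ };

    typedef struct {
      Block* used;     /* blocks currently holding data */
      Block** tail;    /* points at the 'next' field of the last used block */
      Block* free_q;   /* recycled blocks, reused before allocating new ones */
    } Pool;

    static void PoolClear(Pool* const p) {
      if (p->tail != NULL) *p->tail = p->free_q;  /* splice: recycle at once */
      p->free_q = p->used;
      p->used = NULL;
      p->tail = &p->used;
    }

    static Block* PoolNewBlock(Pool* const p, size_t block_bytes) {
      Block* b = p->free_q;
      if (b != NULL) {
        p->free_q = b->next;                     /* reuse a recycled block */
      } else {
        b = (Block*)malloc(sizeof(*b) + block_bytes);
        if (b == NULL) return NULL;
      }
      b->next = NULL;
      b->size = 0;
      *p->tail = b;                              /* append to the used list */
      p->tail = &b->next;
      return b;
    }
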
// -----------------------------------------------------------------------------
// Hash chains
-static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
- uint64_t key = ((uint64_t)(argb[1]) << 32) | argb[0];
- key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
- return key;
-}
-
-static int HashChainInit(HashChain* const p, int size) {
+// initialize as empty
+static void HashChainInit(VP8LHashChain* const p) {
int i;
- p->chain_ = (int*)WebPSafeMalloc((uint64_t)size, sizeof(*p->chain_));
- if (p->chain_ == NULL) {
- return 0;
- }
- for (i = 0; i < size; ++i) {
+ assert(p != NULL);
+ for (i = 0; i < p->size_; ++i) {
p->chain_[i] = -1;
}
for (i = 0; i < HASH_SIZE; ++i) {
p->hash_to_first_index_[i] = -1;
}
+}
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+ assert(p->size_ == 0);
+ assert(p->chain_ == NULL);
+ assert(size > 0);
+ p->chain_ = (int*)WebPSafeMalloc(size, sizeof(*p->chain_));
+ if (p->chain_ == NULL) return 0;
+ p->size_ = size;
+ HashChainInit(p);
return 1;
}
-static void HashChainDelete(HashChain* const p) {
- if (p != NULL) {
- free(p->chain_);
- free(p);
- }
+void VP8LHashChainClear(VP8LHashChain* const p) {
+ assert(p != NULL);
+ WebPSafeFree(p->chain_);
+ p->size_ = 0;
+ p->chain_ = NULL;
+}
+
+// -----------------------------------------------------------------------------
+
+static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
+ uint64_t key = ((uint64_t)argb[1] << 32) | argb[0];
+ key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
+ return key;
}
// Insertion of two pixels at a time.
-static void HashChainInsert(HashChain* const p,
+static void HashChainInsert(VP8LHashChain* const p,
const uint32_t* const argb, int pos) {
const uint64_t hash_code = GetPixPairHash64(argb);
p->chain_[pos] = p->hash_to_first_index_[hash_code];
@@ -161,7 +241,7 @@ static void GetParamsForHashChainFindCopy(int quality, int xsize,
*iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
}
-static int HashChainFindCopy(const HashChain* const p,
+static int HashChainFindCopy(const VP8LHashChain* const p,
int base_position, int xsize_signed,
const uint32_t* const argb, int max_len,
int window_size, int iter_pos, int iter_limit,
@@ -185,10 +265,8 @@ static int HashChainFindCopy(const HashChain* const p,
uint64_t val;
uint32_t curr_length;
uint32_t distance;
- const uint64_t* const ptr1 =
- (const uint64_t*)(argb + pos + best_length - 1);
- const uint64_t* const ptr2 =
- (const uint64_t*)(argb_start + best_length - 1);
+ const uint32_t* const ptr1 = (argb + pos + best_length - 1);
+ const uint32_t* const ptr2 = (argb_start + best_length - 1);
if (iter_pos < 0) {
if (iter_pos < iter_limit || best_val >= 0xff0000) {
@@ -199,7 +277,7 @@ static int HashChainFindCopy(const HashChain* const p,
// Before 'expensive' linear match, check if the two arrays match at the
// current best length index and also for the succeeding elements.
- if (*ptr1 != *ptr2) continue;
+ if (ptr1[0] != ptr2[0] || ptr1[1] != ptr2[1]) continue;
curr_length = FindMatchLength(argb + pos, argb_start, max_len);
if (curr_length < best_length) continue;
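The old code read a 64-bit word through a cast pointer, which risks unaligned access and violates strict aliasing for uint32_t data; the replacement performs two ordinary 32-bit reads at best_length - 1 and best_length as a cheap filter before the full linear match. A sketch of that logic (assuming best_length >= 1 and both reads in range):

    #include <stdint.h>

    /* Count how many leading 32-bit pixels match, up to max_len. */
    static int FindMatchLen(const uint32_t* const a, const uint32_t* const b,
                            int max_len) {
      int n = 0;
      while (n < max_len && a[n] == b[n]) ++n;
      return n;
    }

    /* Quick rejection: a candidate can only beat best_length if it matches at
       positions best_length - 1 and best_length; check those words first. */
    static int MaybeBetter(const uint32_t* cand, const uint32_t* cur,
                           int best_length, int max_len) {
      const uint32_t* const p1 = cand + best_length - 1;
      const uint32_t* const p2 = cur + best_length - 1;
      if (p1[0] != p2[0] || p1[1] != p2[1]) return 0;
      return FindMatchLen(cand, cur, max_len) >= best_length;
    }
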
@@ -237,64 +315,61 @@ static int HashChainFindCopy(const HashChain* const p,
}
static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) {
- int size = refs->size;
while (length >= MAX_LENGTH) {
- refs->refs[size++] = PixOrCopyCreateCopy(1, MAX_LENGTH);
+ BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, MAX_LENGTH));
length -= MAX_LENGTH;
}
if (length > 0) {
- refs->refs[size++] = PixOrCopyCreateCopy(1, length);
+ BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, length));
}
- refs->size = size;
}
-static void BackwardReferencesRle(int xsize, int ysize,
- const uint32_t* const argb,
- VP8LBackwardRefs* const refs) {
+static int BackwardReferencesRle(int xsize, int ysize,
+ const uint32_t* const argb,
+ VP8LBackwardRefs* const refs) {
const int pix_count = xsize * ysize;
int match_len = 0;
int i;
- refs->size = 0;
+ ClearBackwardRefs(refs);
PushBackCopy(refs, match_len); // i=0 case
- refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[0]);
+ BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[0]));
for (i = 1; i < pix_count; ++i) {
if (argb[i] == argb[i - 1]) {
++match_len;
} else {
PushBackCopy(refs, match_len);
match_len = 0;
- refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[i]);
+ BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[i]));
}
}
PushBackCopy(refs, match_len);
+ return !refs->error_;
}
static int BackwardReferencesHashChain(int xsize, int ysize,
const uint32_t* const argb,
int cache_bits, int quality,
+ VP8LHashChain* const hash_chain,
VP8LBackwardRefs* const refs) {
int i;
int ok = 0;
int cc_init = 0;
const int use_color_cache = (cache_bits > 0);
const int pix_count = xsize * ysize;
- HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
VP8LColorCache hashers;
int window_size = WINDOW_SIZE;
int iter_pos = 1;
int iter_limit = -1;
- if (hash_chain == NULL) return 0;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
- if (!HashChainInit(hash_chain, pix_count)) goto Error;
-
- refs->size = 0;
+ ClearBackwardRefs(refs);
GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
&window_size, &iter_pos, &iter_limit);
+ HashChainInit(hash_chain);
for (i = 0; i < pix_count; ) {
// Alternative#1: Code the pixels starting at 'i' using backward reference.
int offset = 0;
@@ -320,14 +395,15 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
if (len2 > len + 1) {
const uint32_t pixel = argb[i];
// Alternative#2 is a better match. So push pixel at 'i' as literal.
+ PixOrCopy v;
if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
- refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+ v = PixOrCopyCreateCacheIdx(ix);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
- refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+ v = PixOrCopyCreateLiteral(pixel);
}
- ++refs->size;
+ BackwardRefsCursorAdd(refs, v);
i++; // Backward reference to be done for next pixel.
len = len2;
offset = offset2;
@@ -336,7 +412,7 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
if (len >= MAX_LENGTH) {
len = MAX_LENGTH - 1;
}
- refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len);
+ BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
if (use_color_cache) {
for (k = 0; k < len; ++k) {
VP8LColorCacheInsert(&hashers, argb[i + k]);
@@ -352,25 +428,25 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
i += len;
} else {
const uint32_t pixel = argb[i];
+ PixOrCopy v;
if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
// push pixel as a PixOrCopyCreateCacheIdx pixel
const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
- refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+ v = PixOrCopyCreateCacheIdx(ix);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
- refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+ v = PixOrCopyCreateLiteral(pixel);
}
- ++refs->size;
+ BackwardRefsCursorAdd(refs, v);
if (i + 1 < pix_count) {
HashChainInsert(hash_chain, &argb[i], i);
}
++i;
}
}
- ok = 1;
+ ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
- HashChainDelete(hash_chain);
return ok;
}
@@ -387,11 +463,12 @@ typedef struct {
static int BackwardReferencesTraceBackwards(
int xsize, int ysize, int recursive_cost_model,
const uint32_t* const argb, int quality, int cache_bits,
+ VP8LHashChain* const hash_chain,
VP8LBackwardRefs* const refs);
static void ConvertPopulationCountTableToBitEstimates(
- int num_symbols, const int population_counts[], double output[]) {
- int sum = 0;
+ int num_symbols, const uint32_t population_counts[], double output[]) {
+ uint32_t sum = 0;
int nonzeros = 0;
int i;
for (i = 0; i < num_symbols; ++i) {
@@ -412,39 +489,45 @@ static void ConvertPopulationCountTableToBitEstimates(
static int CostModelBuild(CostModel* const m, int xsize, int ysize,
int recursion_level, const uint32_t* const argb,
- int quality, int cache_bits) {
+ int quality, int cache_bits,
+ VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs) {
int ok = 0;
- VP8LHistogram histo;
- VP8LBackwardRefs refs;
-
- if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
+ VP8LHistogram* histo = NULL;
+ ClearBackwardRefs(refs);
if (recursion_level > 0) {
if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
- argb, quality, cache_bits, &refs)) {
+ argb, quality, cache_bits, hash_chain,
+ refs)) {
goto Error;
}
} else {
if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
- &refs)) {
+ hash_chain, refs)) {
goto Error;
}
}
- VP8LHistogramCreate(&histo, &refs, cache_bits);
+ histo = VP8LAllocateHistogram(cache_bits);
+ if (histo == NULL) goto Error;
+
+ VP8LHistogramCreate(histo, refs, cache_bits);
+
ConvertPopulationCountTableToBitEstimates(
- VP8LHistogramNumCodes(&histo), histo.literal_, m->literal_);
+ VP8LHistogramNumCodes(histo->palette_code_bits_),
+ histo->literal_, m->literal_);
ConvertPopulationCountTableToBitEstimates(
- VALUES_IN_BYTE, histo.red_, m->red_);
+ VALUES_IN_BYTE, histo->red_, m->red_);
ConvertPopulationCountTableToBitEstimates(
- VALUES_IN_BYTE, histo.blue_, m->blue_);
+ VALUES_IN_BYTE, histo->blue_, m->blue_);
ConvertPopulationCountTableToBitEstimates(
- VALUES_IN_BYTE, histo.alpha_, m->alpha_);
+ VALUES_IN_BYTE, histo->alpha_, m->alpha_);
ConvertPopulationCountTableToBitEstimates(
- NUM_DISTANCE_CODES, histo.distance_, m->distance_);
+ NUM_DISTANCE_CODES, histo->distance_, m->distance_);
ok = 1;
Error:
- VP8LClearBackwardRefs(&refs);
+ VP8LFreeHistogram(histo);
return ok;
}
@@ -476,16 +559,16 @@ static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
static int BackwardReferencesHashChainDistanceOnly(
int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
- int quality, int cache_bits, uint32_t* const dist_array) {
+ int quality, int cache_bits, VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs, uint32_t* const dist_array) {
int i;
int ok = 0;
int cc_init = 0;
const int pix_count = xsize * ysize;
const int use_color_cache = (cache_bits > 0);
float* const cost =
- (float*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
- CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
- HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+ (float*)WebPSafeMalloc(pix_count, sizeof(*cost));
+ CostModel* cost_model = (CostModel*)WebPSafeMalloc(1ULL, sizeof(*cost_model));
VP8LColorCache hashers;
const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
@@ -494,9 +577,7 @@ static int BackwardReferencesHashChainDistanceOnly(
int iter_pos = 1;
int iter_limit = -1;
- if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
-
- if (!HashChainInit(hash_chain, pix_count)) goto Error;
+ if (cost == NULL || cost_model == NULL) goto Error;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
@@ -504,7 +585,7 @@ static int BackwardReferencesHashChainDistanceOnly(
}
if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
- quality, cache_bits)) {
+ quality, cache_bits, hash_chain, refs)) {
goto Error;
}
@@ -515,6 +596,7 @@ static int BackwardReferencesHashChainDistanceOnly(
dist_array[0] = 0;
GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
&window_size, &iter_pos, &iter_limit);
+ HashChainInit(hash_chain);
for (i = 0; i < pix_count; ++i) {
double prev_cost = 0.0;
int shortmax;
@@ -589,12 +671,11 @@ static int BackwardReferencesHashChainDistanceOnly(
}
// Last pixel still to do, it can only be a single step if not reached
// through cheaper means already.
- ok = 1;
+ ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
- HashChainDelete(hash_chain);
- free(cost_model);
- free(cost);
+ WebPSafeFree(cost_model);
+ WebPSafeFree(cost);
return ok;
}
@@ -621,6 +702,7 @@ static int BackwardReferencesHashChainFollowChosenPath(
int xsize, int ysize, const uint32_t* const argb,
int quality, int cache_bits,
const uint32_t* const chosen_path, int chosen_path_size,
+ VP8LHashChain* const hash_chain,
VP8LBackwardRefs* const refs) {
const int pix_count = xsize * ysize;
const int use_color_cache = (cache_bits > 0);
@@ -633,20 +715,17 @@ static int BackwardReferencesHashChainFollowChosenPath(
int window_size = WINDOW_SIZE;
int iter_pos = 1;
int iter_limit = -1;
- HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
VP8LColorCache hashers;
- if (hash_chain == NULL || !HashChainInit(hash_chain, pix_count)) {
- goto Error;
- }
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
- refs->size = 0;
+ ClearBackwardRefs(refs);
GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
&window_size, &iter_pos, &iter_limit);
+ HashChainInit(hash_chain);
for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
int offset = 0;
int len = 0;
@@ -656,7 +735,7 @@ static int BackwardReferencesHashChainFollowChosenPath(
window_size, iter_pos, iter_limit,
&offset, &len);
assert(len == max_len);
- refs->refs[size] = PixOrCopyCreateCopy(offset, len);
+ BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
if (use_color_cache) {
for (k = 0; k < len; ++k) {
VP8LColorCacheInsert(&hashers, argb[i + k]);
@@ -670,26 +749,25 @@ static int BackwardReferencesHashChainFollowChosenPath(
}
i += len;
} else {
+ PixOrCopy v;
if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
// push pixel as a color cache index
const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
- refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
+ v = PixOrCopyCreateCacheIdx(idx);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
- refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
+ v = PixOrCopyCreateLiteral(argb[i]);
}
+ BackwardRefsCursorAdd(refs, v);
if (i + 1 < pix_count) {
HashChainInsert(hash_chain, &argb[i], i);
}
++i;
}
}
- assert(size <= refs->max_size);
- refs->size = size;
- ok = 1;
+ ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
- HashChainDelete(hash_chain);
return ok;
}
@@ -698,142 +776,129 @@ static int BackwardReferencesTraceBackwards(int xsize, int ysize,
int recursive_cost_model,
const uint32_t* const argb,
int quality, int cache_bits,
+ VP8LHashChain* const hash_chain,
VP8LBackwardRefs* const refs) {
int ok = 0;
const int dist_array_size = xsize * ysize;
uint32_t* chosen_path = NULL;
int chosen_path_size = 0;
uint32_t* dist_array =
- (uint32_t*)WebPSafeMalloc((uint64_t)dist_array_size, sizeof(*dist_array));
+ (uint32_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
if (dist_array == NULL) goto Error;
if (!BackwardReferencesHashChainDistanceOnly(
- xsize, ysize, recursive_cost_model, argb, quality, cache_bits,
- dist_array)) {
+ xsize, ysize, recursive_cost_model, argb, quality, cache_bits, hash_chain,
+ refs, dist_array)) {
goto Error;
}
TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
if (!BackwardReferencesHashChainFollowChosenPath(
xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size,
- refs)) {
+ hash_chain, refs)) {
goto Error;
}
ok = 1;
Error:
- free(dist_array);
+ WebPSafeFree(dist_array);
return ok;
}
static void BackwardReferences2DLocality(int xsize,
- VP8LBackwardRefs* const refs) {
- int i;
- for (i = 0; i < refs->size; ++i) {
- if (PixOrCopyIsCopy(&refs->refs[i])) {
- const int dist = refs->refs[i].argb_or_distance;
+ const VP8LBackwardRefs* const refs) {
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ if (PixOrCopyIsCopy(c.cur_pos)) {
+ const int dist = c.cur_pos->argb_or_distance;
const int transformed_dist = DistanceToPlaneCode(xsize, dist);
- refs->refs[i].argb_or_distance = transformed_dist;
+ c.cur_pos->argb_or_distance = transformed_dist;
}
+ VP8LRefsCursorNext(&c);
}
}
-int VP8LGetBackwardReferences(int width, int height,
- const uint32_t* const argb,
- int quality, int cache_bits, int use_2d_locality,
- VP8LBackwardRefs* const best) {
- int ok = 0;
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+ int width, int height, const uint32_t* const argb, int quality,
+ int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs refs_array[2]) {
int lz77_is_useful;
- VP8LBackwardRefs refs_rle, refs_lz77;
const int num_pix = width * height;
-
- VP8LBackwardRefsAlloc(&refs_rle, num_pix);
- VP8LBackwardRefsAlloc(&refs_lz77, num_pix);
- VP8LInitBackwardRefs(best);
- if (refs_rle.refs == NULL || refs_lz77.refs == NULL) {
- Error1:
- VP8LClearBackwardRefs(&refs_rle);
- VP8LClearBackwardRefs(&refs_lz77);
- goto End;
- }
+ VP8LBackwardRefs* best = NULL;
+ VP8LBackwardRefs* const refs_lz77 = &refs_array[0];
+ VP8LBackwardRefs* const refs_rle = &refs_array[1];
if (!BackwardReferencesHashChain(width, height, argb, cache_bits, quality,
- &refs_lz77)) {
- goto End;
+ hash_chain, refs_lz77)) {
+ return NULL;
+ }
+ if (!BackwardReferencesRle(width, height, argb, refs_rle)) {
+ return NULL;
}
- // Backward Reference using RLE only.
- BackwardReferencesRle(width, height, argb, &refs_rle);
{
double bit_cost_lz77, bit_cost_rle;
- VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
- if (histo == NULL) goto Error1;
- // Evaluate lz77 coding
- VP8LHistogramCreate(histo, &refs_lz77, cache_bits);
+ VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+ if (histo == NULL) return NULL;
+ // Evaluate LZ77 coding.
+ VP8LHistogramCreate(histo, refs_lz77, cache_bits);
bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
- // Evaluate RLE coding
- VP8LHistogramCreate(histo, &refs_rle, cache_bits);
+ // Evaluate RLE coding.
+ VP8LHistogramCreate(histo, refs_rle, cache_bits);
bit_cost_rle = VP8LHistogramEstimateBits(histo);
// Decide if LZ77 is useful.
lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
- free(histo);
+ VP8LFreeHistogram(histo);
}
// Choose appropriate backward reference.
if (lz77_is_useful) {
// TraceBackwards is costly. Don't execute it at lower quality.
const int try_lz77_trace_backwards = (quality >= 25);
- *best = refs_lz77; // default guess: lz77 is better
- VP8LClearBackwardRefs(&refs_rle);
+ best = refs_lz77; // default guess: lz77 is better
if (try_lz77_trace_backwards) {
// Set recursion level for large images using a color cache.
const int recursion_level =
(num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
- VP8LBackwardRefs refs_trace;
- if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
- goto End;
- }
+ VP8LBackwardRefs* const refs_trace = &refs_array[1];
+ ClearBackwardRefs(refs_trace);
if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
- quality, cache_bits, &refs_trace)) {
- VP8LClearBackwardRefs(&refs_lz77);
- *best = refs_trace;
+ quality, cache_bits, hash_chain,
+ refs_trace)) {
+ best = refs_trace;
}
}
} else {
- VP8LClearBackwardRefs(&refs_lz77);
- *best = refs_rle;
+ best = refs_rle;
}
if (use_2d_locality) BackwardReferences2DLocality(width, best);
- ok = 1;
-
- End:
- if (!ok) {
- VP8LClearBackwardRefs(best);
- }
- return ok;
+ return best;
}
-// Returns 1 on success.
-static int ComputeCacheHistogram(const uint32_t* const argb,
- int xsize, int ysize,
- const VP8LBackwardRefs* const refs,
- int cache_bits,
- VP8LHistogram* const histo) {
+// Returns entropy for the given cache bits.
+static double ComputeCacheEntropy(const uint32_t* const argb,
+ int xsize, int ysize,
+ const VP8LBackwardRefs* const refs,
+ int cache_bits) {
int pixel_index = 0;
- int i;
uint32_t k;
- VP8LColorCache hashers;
const int use_color_cache = (cache_bits > 0);
int cc_init = 0;
+ double entropy = MAX_ENTROPY;
+ const double kSmallPenaltyForLargeCache = 4.0;
+ VP8LColorCache hashers;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ VP8LHistogram* histo = VP8LAllocateHistogram(cache_bits);
+ if (histo == NULL) goto Error;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
- if (!cc_init) return 0;
+ if (!cc_init) goto Error;
}
- for (i = 0; i < refs->size; ++i) {
- const PixOrCopy* const v = &refs->refs[i];
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
if (PixOrCopyIsLiteral(v)) {
if (use_color_cache &&
VP8LColorCacheContains(&hashers, argb[pixel_index])) {
@@ -853,42 +918,58 @@ static int ComputeCacheHistogram(const uint32_t* const argb,
}
}
pixel_index += PixOrCopyLength(v);
+ VP8LRefsCursorNext(&c);
}
assert(pixel_index == xsize * ysize);
(void)xsize; // xsize is not used in non-debug compilations otherwise.
(void)ysize; // ysize is not used in non-debug compilations otherwise.
+ entropy = VP8LHistogramEstimateBits(histo) +
+ kSmallPenaltyForLargeCache * cache_bits;
+ Error:
if (cc_init) VP8LColorCacheClear(&hashers);
- return 1;
+ VP8LFreeHistogram(histo);
+ return entropy;
}
-// Returns how many bits are to be used for a color cache.
+// *best_cache_bits will contain how many bits are to be used for a color cache.
+// Returns 0 in case of memory error.
int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
- int xsize, int ysize,
+ int xsize, int ysize, int quality,
+ VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs,
int* const best_cache_bits) {
- int ok = 0;
- int cache_bits;
- double lowest_entropy = 1e99;
- VP8LBackwardRefs refs;
- static const double kSmallPenaltyForLargeCache = 4.0;
- static const int quality = 30;
- if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize) ||
- !BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, &refs)) {
- goto Error;
+ int eval_low = 1;
+ int eval_high = 1;
+ double entropy_low = MAX_ENTROPY;
+ double entropy_high = MAX_ENTROPY;
+ int cache_bits_low = 0;
+ int cache_bits_high = MAX_COLOR_CACHE_BITS;
+
+ if (!BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, hash_chain,
+ refs)) {
+ return 0;
}
- for (cache_bits = 0; cache_bits <= MAX_COLOR_CACHE_BITS; ++cache_bits) {
- double cur_entropy;
- VP8LHistogram histo;
- VP8LHistogramInit(&histo, cache_bits);
- ComputeCacheHistogram(argb, xsize, ysize, &refs, cache_bits, &histo);
- cur_entropy = VP8LHistogramEstimateBits(&histo) +
- kSmallPenaltyForLargeCache * cache_bits;
- if (cache_bits == 0 || cur_entropy < lowest_entropy) {
- *best_cache_bits = cache_bits;
- lowest_entropy = cur_entropy;
+ // Do a binary search to find the cache_bits value yielding the lowest entropy.
+ while (cache_bits_high - cache_bits_low > 1) {
+ if (eval_low) {
+ entropy_low =
+ ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_low);
+ eval_low = 0;
+ }
+ if (eval_high) {
+ entropy_high =
+ ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_high);
+ eval_high = 0;
+ }
+ if (entropy_high < entropy_low) {
+ *best_cache_bits = cache_bits_high;
+ cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
+ eval_low = 1;
+ } else {
+ *best_cache_bits = cache_bits_low;
+ cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
+ eval_high = 1;
}
}
- ok = 1;
- Error:
- VP8LClearBackwardRefs(&refs);
- return ok;
+ return 1;
}
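The exhaustive scan over every cache_bits value is replaced by a binary search, valid under the assumption that entropy as a function of cache_bits is roughly unimodal. The same search over an arbitrary cost function, as a standalone sketch:

    #define MAX_BITS 11   /* stand-in for MAX_COLOR_CACHE_BITS */

    /* Binary-search the integer in [0, MAX_BITS] minimizing cost(), assuming
       cost() is roughly unimodal over that range. */
    static int SearchBestBits(double (*cost)(int bits)) {
      int lo = 0, hi = MAX_BITS, best = 0;
      double cost_lo = cost(lo), cost_hi = cost(hi);
      while (hi - lo > 1) {
        const int mid = (lo + hi) / 2;
        if (cost_hi < cost_lo) {
          best = hi;            /* the high side is currently cheaper */
          lo = mid;
          cost_lo = cost(lo);
        } else {
          best = lo;            /* the low side is currently cheaper */
          hi = mid;
          cost_hi = cost(hi);
        }
      }
      return best;
    }
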
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references.h
index e1c75f0..c2c81c5 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references.h
@@ -113,36 +113,96 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
}
// -----------------------------------------------------------------------------
-// VP8LBackwardRefs
+// VP8LHashChain
+
+#define HASH_BITS 18
+#define HASH_SIZE (1 << HASH_BITS)
+
+typedef struct VP8LHashChain VP8LHashChain;
+struct VP8LHashChain {
+ // Stores the most recently added position with the given hash value.
+ int32_t hash_to_first_index_[HASH_SIZE];
+ // chain_[pos] stores the previous position with the same hash value
+ // for every pixel in the image.
+ int32_t* chain_;
+ // This is the maximum size of the hash_chain that can be constructed.
+ // Typically this is the pixel count (width x height) for a given image.
+ int size_;
+};
-typedef struct {
- PixOrCopy* refs;
- int size; // currently used
- int max_size; // maximum capacity
-} VP8LBackwardRefs;
+// Must be called first, to set size.
+int VP8LHashChainInit(VP8LHashChain* const p, int size);
+void VP8LHashChainClear(VP8LHashChain* const p); // release memory
-// Initialize the object. Must be called first. 'refs' can be NULL.
-void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs);
+// -----------------------------------------------------------------------------
+// VP8LBackwardRefs (block-based backward-references storage)
+
+// maximum number of reference blocks the image will be segmented into
+#define MAX_REFS_BLOCK_PER_IMAGE 16
+
+typedef struct PixOrCopyBlock PixOrCopyBlock; // forward declaration
+typedef struct VP8LBackwardRefs VP8LBackwardRefs;
+
+// Container for blocks chain
+struct VP8LBackwardRefs {
+ int block_size_; // common block-size
+ int error_; // set to true if some memory error occurred
+ PixOrCopyBlock* refs_; // list of currently used blocks
+ PixOrCopyBlock** tail_; // for list recycling
+ PixOrCopyBlock* free_blocks_; // free-list
+ PixOrCopyBlock* last_block_; // used for adding new refs (internal)
+};
-// Release memory and re-initialize the object. 'refs' can be NULL.
-void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+// Initialize the object. 'block_size' is the common block size to store
+// references (typically, width * height / MAX_REFS_BLOCK_PER_IMAGE).
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
+// Release memory for backward references.
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
+// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
+int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
+ VP8LBackwardRefs* const dst);
-// Allocate 'max_size' references. Returns false in case of memory error.
-int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size);
+// Cursor for iterating on references content
+typedef struct {
+ // public:
+ PixOrCopy* cur_pos; // current position
+ // private:
+ PixOrCopyBlock* cur_block_; // current block in the refs list
+ const PixOrCopy* last_pos_; // sentinel for switching to next block
+} VP8LRefsCursor;
+
+// Returns a cursor positioned at the beginning of the references list.
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs);
+// Returns true if cursor is pointing at a valid position.
+static WEBP_INLINE int VP8LRefsCursorOk(const VP8LRefsCursor* const c) {
+ return (c->cur_pos != NULL);
+}
+// Move to next block of references. Internal, not to be called directly.
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c);
+// Move to the next position (cur_pos becomes NULL at the end of the list).
+// Should not be called if !VP8LRefsCursorOk().
+static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
+ assert(c != NULL);
+ assert(VP8LRefsCursorOk(c));
+ if (++c->cur_pos == c->last_pos_) VP8LRefsCursorNextBlock(c);
+}
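Typical iteration with the new cursor, using only the API declared above (a usage sketch):

    /* Sketch: visit every PixOrCopy in 'refs' using the cursor API above. */
    void VisitAll(const VP8LBackwardRefs* const refs) {
      VP8LRefsCursor c = VP8LRefsCursorInit(refs);
      while (VP8LRefsCursorOk(&c)) {
        PixOrCopy* const v = c.cur_pos;
        /* ... inspect or update *v ... */
        (void)v;
        VP8LRefsCursorNext(&c);  /* advances across block boundaries */
      }
    }
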
// -----------------------------------------------------------------------------
// Main entry points
// Evaluates best possible backward references for specified quality.
// Further optimize for 2D locality if use_2d_locality flag is set.
-int VP8LGetBackwardReferences(int width, int height,
- const uint32_t* const argb,
- int quality, int cache_bits, int use_2d_locality,
- VP8LBackwardRefs* const best);
+// The return value is a pointer to the better of the two backward refs, i.e.
+// refs[0] or refs[1].
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+ int width, int height, const uint32_t* const argb, int quality,
+ int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs refs[2]);
// Produce an estimate for a good color cache size for the image.
int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
- int xsize, int ysize,
+ int xsize, int ysize, int quality,
+ VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const ref,
int* const best_cache_bits);
#ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config.c
index af7f0b0..53a3bb2 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config.c
@@ -111,7 +111,11 @@ int WebPValidateConfig(const WebPConfig* config) {
return 0;
if (config->show_compressed < 0 || config->show_compressed > 1)
return 0;
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+ if (config->preprocessing < 0 || config->preprocessing > 7)
+#else
if (config->preprocessing < 0 || config->preprocessing > 3)
+#endif
return 0;
if (config->partitions < 0 || config->partitions > 3)
return 0;
@@ -138,3 +142,25 @@ int WebPValidateConfig(const WebPConfig* config) {
//------------------------------------------------------------------------------
+#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#define MAX_LEVEL 9
+
+// Mapping between -z level and -m / -q parameter settings.
+static const struct {
+ uint8_t method_;
+ uint8_t quality_;
+} kLosslessPresets[MAX_LEVEL + 1] = {
+ { 0, 0 }, { 1, 20 }, { 2, 25 }, { 3, 30 }, { 3, 50 },
+ { 4, 50 }, { 4, 75 }, { 4, 90 }, { 5, 90 }, { 6, 100 }
+};
+
+int WebPConfigLosslessPreset(WebPConfig* config, int level) {
+ if (config == NULL || level < 0 || level > MAX_LEVEL) return 0;
+ config->lossless = 1;
+ config->method = kLosslessPresets[level].method_;
+ config->quality = kLosslessPresets[level].quality_;
+ return 1;
+}
+#endif
+
+//------------------------------------------------------------------------------
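Example use of the new preset entry point, which maps a lossless effort level onto -m/-q settings; note it is only compiled in when WEBP_ENCODER_ABI_VERSION is recent enough (a sketch):

    #include "webp/encode.h"

    int SetupLosslessConfig(WebPConfig* const config, int level /* 0..9 */) {
      if (!WebPConfigInit(config)) return 0;           /* ABI version check */
      if (!WebPConfigLosslessPreset(config, level)) return 0;
      /* config->lossless == 1; method/quality now follow kLosslessPresets. */
      return 1;
    }
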
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost.c
index 09699f8..9d2cc01 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost.c
@@ -360,9 +360,10 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
for (ctx = 0; ctx < NUM_CTX; ++ctx) {
const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
uint16_t* const table = proba->level_cost_[ctype][band][ctx];
- const int cost_base = VP8BitCost(1, p[1]);
+ const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
+ const int cost_base = VP8BitCost(1, p[1]) + cost0;
int v;
- table[0] = VP8BitCost(0, p[1]);
+ table[0] = VP8BitCost(0, p[1]) + cost0;
for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
table[v] = cost_base + VariableLevelCost(v, p);
}
@@ -486,4 +487,249 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
};
//------------------------------------------------------------------------------
+// Mode costs
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ const uint16_t* t = res->cost[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+ for (; n < res->last; ++n) {
+ const int v = abs(res->coeffs[n]);
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v >= 2) ? 2 : v;
+ cost += VP8LevelCost(t, v);
+ t = res->cost[b][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+#if defined(WEBP_USE_MIPS32)
+extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
+#endif // WEBP_USE_MIPS32
+
+// TODO(skal): this, and GetResidualCost(), should probably go somewhere
+// under src/dsp/ at some point.
+VP8GetResidualCostFunc VP8GetResidualCost;
+
+void VP8GetResidualCostInit(void) {
+ VP8GetResidualCost = GetResidualCost;
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8GetResidualCost = VP8GetResidualCostMIPS32;
+ }
+#endif
+ }
+}
+
+//------------------------------------------------------------------------------
+// helper functions for residuals struct VP8Residual.
+
+void VP8InitResidual(int first, int coeff_type,
+ VP8Encoder* const enc, VP8Residual* const res) {
+ res->coeff_type = coeff_type;
+ res->prob = enc->proba_.coeffs_[coeff_type];
+ res->stats = enc->proba_.stats_[coeff_type];
+ res->cost = enc->proba_.level_cost_[coeff_type];
+ res->first = first;
+}
+
+static void SetResidualCoeffs(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ int n;
+ res->last = -1;
+ assert(res->first == 0 || coeffs[0] == 0);
+ for (n = 15; n >= 0; --n) {
+ if (coeffs[n]) {
+ res->last = n;
+ break;
+ }
+ }
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+#if defined(WEBP_USE_SSE2)
+extern void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+ VP8Residual* const res);
+#endif // WEBP_USE_SSE2
+
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+void VP8SetResidualCoeffsInit(void) {
+ VP8SetResidualCoeffs = SetResidualCoeffs;
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8SetResidualCoeffs = VP8SetResidualCoeffsSSE2;
+ }
+#endif
+ }
+}
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
+ const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int R = 0;
+ int ctx;
+
+ VP8InitResidual(0, 3, enc, &res);
+ ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(levels, &res);
+ R += VP8GetResidualCost(ctx, &res);
+ return R;
+}
+
+int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int x, y;
+ int R = 0;
+
+ VP8IteratorNzToBytes(it); // re-import the non-zero context
+
+ // DC
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ R += VP8GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
+
+ // AC
+ VP8InitResidual(1, 0, enc, &res);
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ R += VP8GetResidualCost(ctx, &res);
+ it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
+ }
+ }
+ return R;
+}
+
+int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int ch, x, y;
+ int R = 0;
+
+ VP8IteratorNzToBytes(it); // re-import the non-zero context
+
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ R += VP8GetResidualCost(ctx, &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
+ }
+ }
+ }
+ return R;
+}
+
+
+//------------------------------------------------------------------------------
+// Recording of token probabilities.
+
+// Record proba context used
+static int Record(int bit, proba_t* const stats) {
+ proba_t p = *stats;
+ if (p >= 0xffff0000u) { // an overflow is inbound.
+ p = ((p + 1u) >> 1) & 0x7fff7fffu; // -> divide the stats by 2.
+ }
+ // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+ p += 0x00010000u + bit;
+ *stats = p;
+ return bit;
+}
+
+// We keep the table-free variant around for reference, just in case.
+#define USE_LEVEL_CODE_TABLE
+
+// Simulate block coding, but only record statistics.
+// Note: no need to record the fixed probas.
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
+ int n = res->first;
+ // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ proba_t* s = res->stats[n][ctx];
+ if (res->last < 0) {
+ Record(0, s + 0);
+ return 0;
+ }
+ while (n <= res->last) {
+ int v;
+ Record(1, s + 0); // order of record doesn't matter
+ while ((v = res->coeffs[n++]) == 0) {
+ Record(0, s + 1);
+ s = res->stats[VP8EncBands[n]][0];
+ }
+ Record(1, s + 1);
+ if (!Record(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
+ s = res->stats[VP8EncBands[n]][1];
+ } else {
+ v = abs(v);
+#if !defined(USE_LEVEL_CODE_TABLE)
+ if (!Record(v > 4, s + 3)) {
+ if (Record(v != 2, s + 4))
+ Record(v == 4, s + 5);
+ } else if (!Record(v > 10, s + 6)) {
+ Record(v > 6, s + 7);
+ } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
+ Record((v >= 3 + (8 << 1)), s + 9);
+ } else {
+ Record((v >= 3 + (8 << 3)), s + 10);
+ }
+#else
+ if (v > MAX_VARIABLE_LEVEL) {
+ v = MAX_VARIABLE_LEVEL;
+ }
+
+ {
+ const int bits = VP8LevelCodes[v - 1][1];
+ int pattern = VP8LevelCodes[v - 1][0];
+ int i;
+ for (i = 0; (pattern >>= 1) != 0; ++i) {
+ const int mask = 2 << i;
+ if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
+ }
+ }
+#endif
+ s = res->stats[VP8EncBands[n]][2];
+ }
+ }
+ if (n < 16) Record(0, s + 0);
+ return 1;
+}
+
+//------------------------------------------------------------------------------
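Record() packs two counters into a single 32-bit proba_t: the total count in the upper 16 bits and the count of 1-bits in the lower 16, halving both when the total is about to overflow. The update rule again, plus a hypothetical helper deriving the probability of a zero bit:

    #include <stdint.h>

    typedef uint32_t proba_t;   /* upper 16 bits: total; lower 16: ones */

    static int Record(int bit, proba_t* const stats) {
      proba_t p = *stats;
      if (p >= 0xffff0000u) {              /* total is about to overflow */
        p = ((p + 1u) >> 1) & 0x7fff7fffu; /* halve both counters */
      }
      p += 0x00010000u + bit;              /* ++total; ones += bit */
      *stats = p;
      return bit;
    }

    /* Hypothetical helper: probability (in [0,255]) that the bit is 0. */
    static int GetProba(proba_t p) {
      const uint32_t ones = p & 0xffffu;
      const uint32_t total = p >> 16;
      return (total == 0) ? 255 : (int)(255 * (total - ones) / total);
    }
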
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost.h
index 3cbad1a..4e55895 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost.h
@@ -14,12 +14,38 @@
#ifndef WEBP_ENC_COST_H_
#define WEBP_ENC_COST_H_
+#include <assert.h>
+#include <stdlib.h>
#include "./vp8enci.h"
#ifdef __cplusplus
extern "C" {
#endif
+// On-the-fly info about the current set of residuals. Handy to avoid
+// passing zillions of params.
+typedef struct {
+ int first;
+ int last;
+ const int16_t* coeffs;
+
+ int coeff_type;
+ ProbaArray* prob;
+ StatsArray* stats;
+ CostArray* cost;
+} VP8Residual;
+
+void VP8InitResidual(int first, int coeff_type,
+ VP8Encoder* const enc, VP8Residual* const res);
+
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+ VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+void VP8SetResidualCoeffsInit(void); // must be called first
+
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
+
// approximate cost per level:
extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
extern const uint16_t VP8EntropyCost[256]; // 8bit fixed-point log(p)
@@ -29,6 +55,12 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
}
+// Cost calculation function.
+typedef int (*VP8GetResidualCostFunc)(int ctx0, const VP8Residual* const res);
+extern VP8GetResidualCostFunc VP8GetResidualCost;
+
+void VP8GetResidualCostInit(void); // must be called first
+
// Level cost calculations
extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
void VP8CalculateLevelCosts(VP8Proba* const proba);
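
[Editor's note] VP8BitCost() above is a table lookup into VP8EntropyCost[], which stores -log2(p/256) in a fixed-point scale (the header labels it 8-bit fixed point). The sketch below is an assumed model of how such a table can be generated; it is not the shipped table from cost.c, whose exact rounding and scale may differ:

#include <math.h>
#include <stdio.h>

int main(void) {
  int p;
  for (p = 1; p < 256; p += 85) {  /* sample a few probabilities */
    const double bits = -log2(p / 256.0);  /* cost of coding the likely bit */
    printf("proba %3d -> cost ~%4d (1/256-bit units)\n",
           p, (int)(bits * 256.0 + 0.5));
  }
  return 0;
}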
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter.c
index dd27804..11db4bd 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter.c
@@ -13,6 +13,7 @@
#include <assert.h>
#include "./vp8enci.h"
+#include "../dsp/dsp.h"
// This table gives, for a given sharpness, the filtering strength to be
// used (at least) in order to filter a given edge step delta.
@@ -61,180 +62,6 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta) {
return kLevelsFromDelta[sharpness][pos];
}
-// -----------------------------------------------------------------------------
-// NOTE: clip1, tables and InitTables are repeated entries of dsp.c
-static uint8_t abs0[255 + 255 + 1]; // abs(i)
-static uint8_t abs1[255 + 255 + 1]; // abs(i)>>1
-static int8_t sclip1[1020 + 1020 + 1]; // clips [-1020, 1020] to [-128, 127]
-static int8_t sclip2[112 + 112 + 1]; // clips [-112, 112] to [-16, 15]
-static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
-
-static int tables_ok = 0;
-
-static void InitTables(void) {
- if (!tables_ok) {
- int i;
- for (i = -255; i <= 255; ++i) {
- abs0[255 + i] = (i < 0) ? -i : i;
- abs1[255 + i] = abs0[255 + i] >> 1;
- }
- for (i = -1020; i <= 1020; ++i) {
- sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
- }
- for (i = -112; i <= 112; ++i) {
- sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
- }
- for (i = -255; i <= 255 + 255; ++i) {
- clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
- }
- tables_ok = 1;
- }
-}
-
-//------------------------------------------------------------------------------
-// Edge filtering functions
-
-// 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
- const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
- const int a1 = sclip2[112 + ((a + 4) >> 3)];
- const int a2 = sclip2[112 + ((a + 3) >> 3)];
- p[-step] = clip1[255 + p0 + a2];
- p[ 0] = clip1[255 + q0 - a1];
-}
-
-// 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
- const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- const int a = 3 * (q0 - p0);
- const int a1 = sclip2[112 + ((a + 4) >> 3)];
- const int a2 = sclip2[112 + ((a + 3) >> 3)];
- const int a3 = (a1 + 1) >> 1;
- p[-2*step] = clip1[255 + p1 + a3];
- p[- step] = clip1[255 + p0 + a2];
- p[ 0] = clip1[255 + q0 - a1];
- p[ step] = clip1[255 + q1 - a3];
-}
-
-// high edge-variance
-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
- const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
-}
-
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
- const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
-}
-
-static WEBP_INLINE int needs_filter2(const uint8_t* p,
- int step, int t, int it) {
- const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
- const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
- if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
- return 0;
- return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
- abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
- abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
-}
-
-//------------------------------------------------------------------------------
-// Simple In-loop filtering (Paragraph 15.2)
-
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
- int i;
- for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i, stride, thresh)) {
- do_filter2(p + i, stride);
- }
- }
-}
-
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
- int i;
- for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i * stride, 1, thresh)) {
- do_filter2(p + i * stride, 1);
- }
- }
-}
-
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
- int k;
- for (k = 3; k > 0; --k) {
- p += 4 * stride;
- SimpleVFilter16(p, stride, thresh);
- }
-}
-
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
- int k;
- for (k = 3; k > 0; --k) {
- p += 4;
- SimpleHFilter16(p, stride, thresh);
- }
-}
-
-//------------------------------------------------------------------------------
-// Complex In-loop filtering (Paragraph 15.3)
-
-static WEBP_INLINE void FilterLoop24(uint8_t* p,
- int hstride, int vstride, int size,
- int thresh, int ithresh, int hev_thresh) {
- while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
- if (hev(p, hstride, hev_thresh)) {
- do_filter2(p, hstride);
- } else {
- do_filter4(p, hstride);
- }
- }
- p += vstride;
- }
-}
-
-// on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
- int k;
- for (k = 3; k > 0; --k) {
- p += 4 * stride;
- FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
- }
-}
-
-static void HFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
- int k;
- for (k = 3; k > 0; --k) {
- p += 4;
- FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
- }
-}
-
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
- FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-}
-
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
- FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-}
-
-//------------------------------------------------------------------------------
-
-void (*VP8EncVFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
-void (*VP8EncHFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
-void (*VP8EncVFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
-void (*VP8EncHFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
-
-void (*VP8EncSimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
-void (*VP8EncSimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
-
//------------------------------------------------------------------------------
// Paragraph 15.4: compute the inner-edge filtering strength
@@ -266,14 +93,14 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
memcpy(y_dst, it->yuv_out_, YUV_SIZE * sizeof(uint8_t));
if (enc->filter_hdr_.simple_ == 1) { // simple
- VP8EncSimpleHFilter16i(y_dst, BPS, limit);
- VP8EncSimpleVFilter16i(y_dst, BPS, limit);
+ VP8SimpleHFilter16i(y_dst, BPS, limit);
+ VP8SimpleVFilter16i(y_dst, BPS, limit);
} else { // complex
const int hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
- VP8EncHFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
- VP8EncHFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
- VP8EncVFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
- VP8EncVFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+ VP8HFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+ VP8HFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+ VP8VFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+ VP8VFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
}
}
@@ -387,7 +214,6 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
void VP8InitFilter(VP8EncIterator* const it) {
if (it->lf_stats_ != NULL) {
int s, i;
- InitTables();
for (s = 0; s < NUM_MB_SEGMENTS; s++) {
for (i = 0; i < MAX_LF_LEVELS; i++) {
(*it->lf_stats_)[s][i] = 0;
@@ -468,4 +294,3 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it) {
}
// -----------------------------------------------------------------------------
-
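
[Editor's note] The in-loop filtering code removed above is not lost: filter.c now calls the shared VP8SimpleHFilter16i / VP8HFilter16i / ... hooks from src/dsp (note the new ../dsp/dsp.h include). For readers comparing the two versions, here is the same 2-pixel simple filter rewritten with inline clamping in place of the sclip1/sclip2/clip1 lookup tables, as a reference sketch only:

#include <stdint.h>

static int Clamp(int v, int lo, int hi) {
  return (v < lo) ? lo : (v > hi) ? hi : v;
}

/* Equivalent of do_filter2() above: 4 pixels in, 2 pixels out. */
static void DoFilter2(uint8_t* p, int step) {
  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
  const int a  = 3 * (q0 - p0) + Clamp(p1 - q1, -128, 127);  /* sclip1 */
  const int a1 = Clamp((a + 4) >> 3, -16, 15);               /* sclip2 */
  const int a2 = Clamp((a + 3) >> 3, -16, 15);               /* sclip2 */
  p[-step] = (uint8_t)Clamp(p0 + a2, 0, 255);                /* clip1 */
  p[0]     = (uint8_t)Clamp(q0 - a1, 0, 255);                /* clip1 */
}

int main(void) {
  uint8_t row[4] = {60, 70, 110, 120};
  DoFilter2(row + 2, 1);  /* smooths the edge between row[1] and row[2] */
  return 0;
}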
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame.c
index 2582244..cdf1dab 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame.c
@@ -11,8 +11,6 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include <assert.h>
-#include <stdlib.h>
#include <string.h>
#include <math.h>
@@ -23,19 +21,6 @@
#define SEGMENT_VISU 0
#define DEBUG_SEARCH 0 // useful to track search convergence
-// On-the-fly info about the current set of residuals. Handy to avoid
-// passing zillions of params.
-typedef struct {
- int first;
- int last;
- const int16_t* coeffs;
-
- int coeff_type;
- ProbaArray* prob;
- StatsArray* stats;
- CostArray* cost;
-} VP8Residual;
-
//------------------------------------------------------------------------------
// multi-pass convergence
@@ -142,83 +127,6 @@ static int FinalizeSkipProba(VP8Encoder* const enc) {
return size;
}
-//------------------------------------------------------------------------------
-// Recording of token probabilities.
-
-static void ResetTokenStats(VP8Encoder* const enc) {
- VP8Proba* const proba = &enc->proba_;
- memset(proba->stats_, 0, sizeof(proba->stats_));
-}
-
-// Record proba context used
-static int Record(int bit, proba_t* const stats) {
- proba_t p = *stats;
- if (p >= 0xffff0000u) { // an overflow is inbound.
- p = ((p + 1u) >> 1) & 0x7fff7fffu; // -> divide the stats by 2.
- }
- // record bit count (lower 16 bits) and increment total count (upper 16 bits).
- p += 0x00010000u + bit;
- *stats = p;
- return bit;
-}
-
-// We keep the table free variant around for reference, in case.
-#define USE_LEVEL_CODE_TABLE
-
-// Simulate block coding, but only record statistics.
-// Note: no need to record the fixed probas.
-static int RecordCoeffs(int ctx, const VP8Residual* const res) {
- int n = res->first;
- // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
- proba_t* s = res->stats[n][ctx];
- if (res->last < 0) {
- Record(0, s + 0);
- return 0;
- }
- while (n <= res->last) {
- int v;
- Record(1, s + 0); // order of record doesn't matter
- while ((v = res->coeffs[n++]) == 0) {
- Record(0, s + 1);
- s = res->stats[VP8EncBands[n]][0];
- }
- Record(1, s + 1);
- if (!Record(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
- s = res->stats[VP8EncBands[n]][1];
- } else {
- v = abs(v);
-#if !defined(USE_LEVEL_CODE_TABLE)
- if (!Record(v > 4, s + 3)) {
- if (Record(v != 2, s + 4))
- Record(v == 4, s + 5);
- } else if (!Record(v > 10, s + 6)) {
- Record(v > 6, s + 7);
- } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
- Record((v >= 3 + (8 << 1)), s + 9);
- } else {
- Record((v >= 3 + (8 << 3)), s + 10);
- }
-#else
- if (v > MAX_VARIABLE_LEVEL)
- v = MAX_VARIABLE_LEVEL;
-
- {
- const int bits = VP8LevelCodes[v - 1][1];
- int pattern = VP8LevelCodes[v - 1][0];
- int i;
- for (i = 0; (pattern >>= 1) != 0; ++i) {
- const int mask = 2 << i;
- if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
- }
- }
-#endif
- s = res->stats[VP8EncBands[n]][2];
- }
- }
- if (n < 16) Record(0, s + 0);
- return 1;
-}
-
// Collect statistics and deduce probabilities for next coding pass.
// Return the total bit-cost for coding the probability updates.
static int CalcTokenProba(int nb, int total) {
@@ -231,6 +139,11 @@ static int BranchCost(int nb, int total, int proba) {
return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
}
+static void ResetTokenStats(VP8Encoder* const enc) {
+ VP8Proba* const proba = &enc->proba_;
+ memset(proba->stats_, 0, sizeof(proba->stats_));
+}
+
static int FinalizeTokenProbas(VP8Proba* const proba) {
int has_changed = 0;
int size = 0;
@@ -309,131 +222,6 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
}
//------------------------------------------------------------------------------
-// helper functions for residuals struct VP8Residual.
-
-static void InitResidual(int first, int coeff_type,
- VP8Encoder* const enc, VP8Residual* const res) {
- res->coeff_type = coeff_type;
- res->prob = enc->proba_.coeffs_[coeff_type];
- res->stats = enc->proba_.stats_[coeff_type];
- res->cost = enc->proba_.level_cost_[coeff_type];
- res->first = first;
-}
-
-static void SetResidualCoeffs(const int16_t* const coeffs,
- VP8Residual* const res) {
- int n;
- res->last = -1;
- for (n = 15; n >= res->first; --n) {
- if (coeffs[n]) {
- res->last = n;
- break;
- }
- }
- res->coeffs = coeffs;
-}
-
-//------------------------------------------------------------------------------
-// Mode costs
-
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
- int n = res->first;
- // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
- int p0 = res->prob[n][ctx0][0];
- const uint16_t* t = res->cost[n][ctx0];
- int cost;
-
- if (res->last < 0) {
- return VP8BitCost(0, p0);
- }
- cost = VP8BitCost(1, p0);
- for (; n < res->last; ++n) {
- const int v = abs(res->coeffs[n]);
- const int b = VP8EncBands[n + 1];
- const int ctx = (v >= 2) ? 2 : v;
- cost += VP8LevelCost(t, v);
- t = res->cost[b][ctx];
- // the masking trick is faster than "if (v) cost += ..." with clang
- cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
- }
- // Last coefficient is always non-zero
- {
- const int v = abs(res->coeffs[n]);
- assert(v != 0);
- cost += VP8LevelCost(t, v);
- if (n < 15) {
- const int b = VP8EncBands[n + 1];
- const int ctx = (v == 1) ? 1 : 2;
- const int last_p0 = res->prob[b][ctx][0];
- cost += VP8BitCost(0, last_p0);
- }
- }
- return cost;
-}
-
-int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
- const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
- VP8Residual res;
- VP8Encoder* const enc = it->enc_;
- int R = 0;
- int ctx;
-
- InitResidual(0, 3, enc, &res);
- ctx = it->top_nz_[x] + it->left_nz_[y];
- SetResidualCoeffs(levels, &res);
- R += GetResidualCost(ctx, &res);
- return R;
-}
-
-int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
- VP8Residual res;
- VP8Encoder* const enc = it->enc_;
- int x, y;
- int R = 0;
-
- VP8IteratorNzToBytes(it); // re-import the non-zero context
-
- // DC
- InitResidual(0, 1, enc, &res);
- SetResidualCoeffs(rd->y_dc_levels, &res);
- R += GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
-
- // AC
- InitResidual(1, 0, enc, &res);
- for (y = 0; y < 4; ++y) {
- for (x = 0; x < 4; ++x) {
- const int ctx = it->top_nz_[x] + it->left_nz_[y];
- SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
- R += GetResidualCost(ctx, &res);
- it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
- }
- }
- return R;
-}
-
-int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
- VP8Residual res;
- VP8Encoder* const enc = it->enc_;
- int ch, x, y;
- int R = 0;
-
- VP8IteratorNzToBytes(it); // re-import the non-zero context
-
- InitResidual(0, 2, enc, &res);
- for (ch = 0; ch <= 2; ch += 2) {
- for (y = 0; y < 2; ++y) {
- for (x = 0; x < 2; ++x) {
- const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
- SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
- R += GetResidualCost(ctx, &res);
- it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
- }
- }
- }
- return R;
-}
-
-//------------------------------------------------------------------------------
// Coefficient coding
static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
@@ -521,32 +309,32 @@ static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
pos1 = VP8BitWriterPos(bw);
if (i16) {
- InitResidual(0, 1, enc, &res);
- SetResidualCoeffs(rd->y_dc_levels, &res);
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
- InitResidual(1, 0, enc, &res);
+ VP8InitResidual(1, 0, enc, &res);
} else {
- InitResidual(0, 3, enc, &res);
+ VP8InitResidual(0, 3, enc, &res);
}
// luma-AC
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
- SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
it->top_nz_[x] = it->left_nz_[y] = PutCoeffs(bw, ctx, &res);
}
}
pos2 = VP8BitWriterPos(bw);
// U/V
- InitResidual(0, 2, enc, &res);
+ VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
- SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
PutCoeffs(bw, ctx, &res);
}
@@ -571,33 +359,33 @@ static void RecordResiduals(VP8EncIterator* const it,
VP8IteratorNzToBytes(it);
if (it->mb_->type_ == 1) { // i16x16
- InitResidual(0, 1, enc, &res);
- SetResidualCoeffs(rd->y_dc_levels, &res);
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
- RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
- InitResidual(1, 0, enc, &res);
+ VP8RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
+ VP8InitResidual(1, 0, enc, &res);
} else {
- InitResidual(0, 3, enc, &res);
+ VP8InitResidual(0, 3, enc, &res);
}
// luma-AC
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
- SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
- it->top_nz_[x] = it->left_nz_[y] = RecordCoeffs(ctx, &res);
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] = VP8RecordCoeffs(ctx, &res);
}
}
// U/V
- InitResidual(0, 2, enc, &res);
+ VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
- SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
- RecordCoeffs(ctx, &res);
+ VP8RecordCoeffs(ctx, &res);
}
}
}
@@ -610,8 +398,8 @@ static void RecordResiduals(VP8EncIterator* const it,
#if !defined(DISABLE_TOKEN_BUFFER)
-static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
- VP8TBuffer* const tokens) {
+static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+ VP8TBuffer* const tokens) {
int x, y, ch;
VP8Residual res;
VP8Encoder* const enc = it->enc_;
@@ -619,44 +407,45 @@ static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
VP8IteratorNzToBytes(it);
if (it->mb_->type_ == 1) { // i16x16
const int ctx = it->top_nz_[8] + it->left_nz_[8];
- InitResidual(0, 1, enc, &res);
- SetResidualCoeffs(rd->y_dc_levels, &res);
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
VP8RecordCoeffTokens(ctx, 1,
res.first, res.last, res.coeffs, tokens);
- RecordCoeffs(ctx, &res);
- InitResidual(1, 0, enc, &res);
+ VP8RecordCoeffs(ctx, &res);
+ VP8InitResidual(1, 0, enc, &res);
} else {
- InitResidual(0, 3, enc, &res);
+ VP8InitResidual(0, 3, enc, &res);
}
// luma-AC
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
- SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
it->top_nz_[x] = it->left_nz_[y] =
VP8RecordCoeffTokens(ctx, res.coeff_type,
res.first, res.last, res.coeffs, tokens);
- RecordCoeffs(ctx, &res);
+ VP8RecordCoeffs(ctx, &res);
}
}
// U/V
- InitResidual(0, 2, enc, &res);
+ VP8InitResidual(0, 2, enc, &res);
for (ch = 0; ch <= 2; ch += 2) {
for (y = 0; y < 2; ++y) {
for (x = 0; x < 2; ++x) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
- SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
VP8RecordCoeffTokens(ctx, 2,
res.first, res.last, res.coeffs, tokens);
- RecordCoeffs(ctx, &res);
+ VP8RecordCoeffs(ctx, &res);
}
}
}
VP8IteratorBytesToNz(it);
+ return !tokens->error_;
}
#endif // !DISABLE_TOKEN_BUFFER
@@ -719,7 +508,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
}
case 7: *info = mb->alpha_; break;
default: *info = 0; break;
- };
+ }
}
#if SEGMENT_VISU // visualize segments and prediction modes
SetBlock(it->yuv_out_ + Y_OFF, mb->segment_ * 64, 16);
@@ -863,7 +652,10 @@ static int PreLoopInitialize(VP8Encoder* const enc) {
for (p = 0; ok && p < enc->num_parts_; ++p) {
ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
}
- if (!ok) VP8EncFreeBitWriters(enc); // malloc error occurred
+ if (!ok) {
+ VP8EncFreeBitWriters(enc); // malloc error occurred
+ WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
return ok;
}
@@ -928,11 +720,6 @@ int VP8EncLoop(VP8Encoder* const enc) {
} else { // reset predictors after a skip
ResetAfterSkip(&it);
}
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (enc->use_layer_) {
- VP8EncCodeLayerBlock(&it);
- }
-#endif
StoreSideInfo(&it);
VP8StoreFilterStats(&it);
VP8IteratorExport(&it);
@@ -997,14 +784,13 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
cnt = max_count;
}
VP8Decimate(&it, &info, rd_opt);
- RecordTokens(&it, &info, &enc->tokens_);
+ ok = RecordTokens(&it, &info, &enc->tokens_);
+ if (!ok) {
+ WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ break;
+ }
size_p0 += info.H;
distortion += info.D;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (enc->use_layer_) {
- VP8EncCodeLayerBlock(&it);
- }
-#endif
if (is_last_pass) {
StoreSideInfo(&it);
VP8StoreFilterStats(&it);
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram.c
index abd253b..7c6abb4 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram.c
@@ -10,31 +10,64 @@
// Author: Jyrki Alakuijala (jyrki@google.com)
//
#ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "../webp/config.h"
#endif
#include <math.h>
-#include <stdio.h>
#include "./backward_references.h"
#include "./histogram.h"
#include "../dsp/lossless.h"
#include "../utils/utils.h"
+#define MAX_COST 1.e38
+
+// Number of partitions for the three dominant (literal, red and blue) symbol
+// costs.
+#define NUM_PARTITIONS 4
+// The size of the bin-hash corresponding to the three dominant costs.
+#define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+
static void HistogramClear(VP8LHistogram* const p) {
- memset(p->literal_, 0, sizeof(p->literal_));
- memset(p->red_, 0, sizeof(p->red_));
- memset(p->blue_, 0, sizeof(p->blue_));
- memset(p->alpha_, 0, sizeof(p->alpha_));
- memset(p->distance_, 0, sizeof(p->distance_));
- p->bit_cost_ = 0;
+ uint32_t* const literal = p->literal_;
+ const int cache_bits = p->palette_code_bits_;
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ memset(p, 0, histo_size);
+ p->palette_code_bits_ = cache_bits;
+ p->literal_ = literal;
+}
+
+static void HistogramCopy(const VP8LHistogram* const src,
+ VP8LHistogram* const dst) {
+ uint32_t* const dst_literal = dst->literal_;
+ const int dst_cache_bits = dst->palette_code_bits_;
+ const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
+ assert(src->palette_code_bits_ == dst_cache_bits);
+ memcpy(dst, src, histo_size);
+ dst->literal_ = dst_literal;
+}
+
+int VP8LGetHistogramSize(int cache_bits) {
+ const int literal_size = VP8LHistogramNumCodes(cache_bits);
+ const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
+ assert(total_size <= (size_t)0x7fffffff);
+ return (int)total_size;
+}
+
+void VP8LFreeHistogram(VP8LHistogram* const histo) {
+ WebPSafeFree(histo);
+}
+
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo) {
+ WebPSafeFree(histo);
}
void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
VP8LHistogram* const histo) {
- int i;
- for (i = 0; i < refs->size; ++i) {
- VP8LHistogramAddSinglePixOrCopy(histo, &refs->refs[i]);
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+ VP8LRefsCursorNext(&c);
}
}
@@ -53,13 +86,24 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) {
HistogramClear(p);
}
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
+ VP8LHistogram* histo = NULL;
+ const int total_size = VP8LGetHistogramSize(cache_bits);
+ uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+ if (memory == NULL) return NULL;
+ histo = (VP8LHistogram*)memory;
+ // literal_ won't necessarily be aligned.
+ histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+ VP8LHistogramInit(histo, cache_bits);
+ return histo;
+}
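
[Editor's note] The single allocation above places literal_ immediately after the struct, so one WebPSafeFree() releases both. How big the literal array is depends on cache_bits; a quick check of the arithmetic, using the lossless-format constants as assumed stand-ins (NUM_LITERAL_CODES = 256, NUM_LENGTH_CODES = 24):

#include <stdio.h>

#define NUM_LITERAL_CODES 256  /* assumed value, per the lossless format */
#define NUM_LENGTH_CODES   24  /* assumed value, per the lossless format */

static int HistogramNumCodes(int palette_code_bits) {
  return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
         ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
}

int main(void) {
  /* cache_bits = 3: 256 + 24 + 8 = 288 uint32_t slots follow the struct */
  printf("literal slots for cache_bits=3: %d\n", HistogramNumCodes(3));
  return 0;
}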
+
VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
int i;
VP8LHistogramSet* set;
- VP8LHistogram* bulk;
- const uint64_t total_size = sizeof(*set)
- + (uint64_t)size * sizeof(*set->histograms)
- + (uint64_t)size * sizeof(**set->histograms);
+ const size_t total_size = sizeof(*set)
+ + sizeof(*set->histograms) * size
+ + (size_t)VP8LGetHistogramSize(cache_bits) * size;
uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
if (memory == NULL) return NULL;
@@ -67,12 +111,15 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
memory += sizeof(*set);
set->histograms = (VP8LHistogram**)memory;
memory += size * sizeof(*set->histograms);
- bulk = (VP8LHistogram*)memory;
set->max_size = size;
set->size = size;
for (i = 0; i < size; ++i) {
- set->histograms[i] = bulk + i;
+ set->histograms[i] = (VP8LHistogram*)memory;
+ // literal_ won't necessarily be aligned.
+ set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
VP8LHistogramInit(set->histograms[i], cache_bits);
+ // There's no padding/alignment between successive histograms.
+ memory += VP8LGetHistogramSize(cache_bits);
}
return set;
}
@@ -87,36 +134,21 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
++histo->literal_[PixOrCopyLiteral(v, 1)];
++histo->blue_[PixOrCopyLiteral(v, 0)];
} else if (PixOrCopyIsCacheIdx(v)) {
- int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+ const int literal_ix =
+ NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
++histo->literal_[literal_ix];
} else {
int code, extra_bits;
VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
- ++histo->literal_[256 + code];
+ ++histo->literal_[NUM_LITERAL_CODES + code];
VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
++histo->distance_[code];
}
}
-static double BitsEntropy(const int* const array, int n) {
- double retval = 0.;
- int sum = 0;
- int nonzeros = 0;
- int max_val = 0;
- int i;
+static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val,
+ double retval) {
double mix;
- for (i = 0; i < n; ++i) {
- if (array[i] != 0) {
- sum += array[i];
- ++nonzeros;
- retval -= VP8LFastSLog2(array[i]);
- if (max_val < array[i]) {
- max_val = array[i];
- }
- }
- }
- retval += VP8LFastSLog2(sum);
-
if (nonzeros < 5) {
if (nonzeros <= 1) {
return 0;
@@ -147,95 +179,142 @@ static double BitsEntropy(const int* const array, int n) {
}
}
-// Returns the cost encode the rle-encoded entropy code.
-// The constants in this function are experimental.
-static double HuffmanCost(const int* const population, int length) {
- // Small bias because Huffman code length is typically not stored in
- // full length.
- static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
- static const double kSmallBias = 9.1;
- double retval = kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
- int streak = 0;
- int i = 0;
- for (; i < length - 1; ++i) {
- ++streak;
- if (population[i] == population[i + 1]) {
- continue;
- }
- last_streak_hack:
- // population[i] points now to the symbol in the streak of same values.
- if (streak > 3) {
- if (population[i] == 0) {
- retval += 1.5625 + 0.234375 * streak;
- } else {
- retval += 2.578125 + 0.703125 * streak;
- }
- } else {
- if (population[i] == 0) {
- retval += 1.796875 * streak;
- } else {
- retval += 3.28125 * streak;
+static double BitsEntropy(const uint32_t* const array, int n) {
+ double retval = 0.;
+ uint32_t sum = 0;
+ int nonzeros = 0;
+ uint32_t max_val = 0;
+ int i;
+ for (i = 0; i < n; ++i) {
+ if (array[i] != 0) {
+ sum += array[i];
+ ++nonzeros;
+ retval -= VP8LFastSLog2(array[i]);
+ if (max_val < array[i]) {
+ max_val = array[i];
}
}
- streak = 0;
}
- if (i == length - 1) {
- ++streak;
- goto last_streak_hack;
+ retval += VP8LFastSLog2(sum);
+ return BitsEntropyRefine(nonzeros, sum, max_val, retval);
+}
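
[Editor's note] Before BitsEntropyRefine() applies its mixing heuristics, the loop above accumulates the plain Shannon estimate sum * log2(sum) - SUM x[i] * log2(x[i]), with VP8LFastSLog2(x) assumed to approximate x * log2(x). A uniform population makes the raw part easy to verify by hand:

#include <math.h>
#include <stdio.h>

static double SLog2(double x) { return (x == 0.) ? 0. : x * log2(x); }

int main(void) {
  const double x[4] = {8., 8., 8., 8.};  /* uniform: 2 bits per symbol */
  double sum = 0., cost = 0.;
  int i;
  for (i = 0; i < 4; ++i) {
    sum += x[i];
    cost -= SLog2(x[i]);
  }
  cost += SLog2(sum);
  printf("total = %.1f bits (32 symbols x 2 bits)\n", cost);  /* 64.0 */
  return 0;
}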
+
+static double BitsEntropyCombined(const uint32_t* const X,
+ const uint32_t* const Y, int n) {
+ double retval = 0.;
+ int sum = 0;
+ int nonzeros = 0;
+ int max_val = 0;
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int xy = X[i] + Y[i];
+ if (xy != 0) {
+ sum += xy;
+ ++nonzeros;
+ retval -= VP8LFastSLog2(xy);
+ if (max_val < xy) {
+ max_val = xy;
+ }
+ }
}
+ retval += VP8LFastSLog2(sum);
+ return BitsEntropyRefine(nonzeros, sum, max_val, retval);
+}
+
+static double InitialHuffmanCost(void) {
+ // Small bias because Huffman code length is typically not stored in
+ // full length.
+ static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+ static const double kSmallBias = 9.1;
+ return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+}
+
+// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
+static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+ double retval = InitialHuffmanCost();
+ retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+ retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+ retval += 1.796875 * stats->streaks[0][0];
+ retval += 3.28125 * stats->streaks[1][0];
return retval;
}
-static double PopulationCost(const int* const population, int length) {
+// Trampolines
+static double HuffmanCost(const uint32_t* const population, int length) {
+ const VP8LStreaks stats = VP8LHuffmanCostCount(population, length);
+ return FinalHuffmanCost(&stats);
+}
+
+static double HuffmanCostCombined(const uint32_t* const X,
+ const uint32_t* const Y, int length) {
+ const VP8LStreaks stats = VP8LHuffmanCostCombinedCount(X, Y, length);
+ return FinalHuffmanCost(&stats);
+}
+
+// Aggregated costs
+static double PopulationCost(const uint32_t* const population, int length) {
return BitsEntropy(population, length) + HuffmanCost(population, length);
}
-static double ExtraCost(const int* const population, int length) {
- int i;
- double cost = 0.;
- for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
- return cost;
+static double GetCombinedEntropy(const uint32_t* const X,
+ const uint32_t* const Y, int length) {
+ return BitsEntropyCombined(X, Y, length) + HuffmanCostCombined(X, Y, length);
}
// Estimates the Entropy + Huffman + other block overhead size cost.
double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
- return PopulationCost(p->literal_, VP8LHistogramNumCodes(p))
- + PopulationCost(p->red_, 256)
- + PopulationCost(p->blue_, 256)
- + PopulationCost(p->alpha_, 256)
- + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
- + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
- + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+ return
+ PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
+ + PopulationCost(p->red_, NUM_LITERAL_CODES)
+ + PopulationCost(p->blue_, NUM_LITERAL_CODES)
+ + PopulationCost(p->alpha_, NUM_LITERAL_CODES)
+ + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
+ + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+ + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
}
double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
- return BitsEntropy(p->literal_, VP8LHistogramNumCodes(p))
- + BitsEntropy(p->red_, 256)
- + BitsEntropy(p->blue_, 256)
- + BitsEntropy(p->alpha_, 256)
- + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
- + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
- + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+ return
+ BitsEntropy(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
+ + BitsEntropy(p->red_, NUM_LITERAL_CODES)
+ + BitsEntropy(p->blue_, NUM_LITERAL_CODES)
+ + BitsEntropy(p->alpha_, NUM_LITERAL_CODES)
+ + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
+ + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+ + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
}
// -----------------------------------------------------------------------------
// Various histogram combine/cost-eval functions
-// Adds 'in' histogram to 'out'
-static void HistogramAdd(const VP8LHistogram* const in,
- VP8LHistogram* const out) {
- int i;
- for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
- out->literal_[i] += in->literal_[i];
- }
- for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
- out->distance_[i] += in->distance_[i];
- }
- for (i = 0; i < 256; ++i) {
- out->red_[i] += in->red_[i];
- out->blue_[i] += in->blue_[i];
- out->alpha_[i] += in->alpha_[i];
- }
+static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ double cost_threshold,
+ double* cost) {
+ const int palette_code_bits = a->palette_code_bits_;
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ *cost += GetCombinedEntropy(a->literal_, b->literal_,
+ VP8LHistogramNumCodes(palette_code_bits));
+ *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+ b->literal_ + NUM_LITERAL_CODES,
+ NUM_LENGTH_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+ *cost += VP8LExtraCostCombined(a->distance_, b->distance_,
+ NUM_DISTANCE_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ return 1;
}
// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
@@ -250,41 +329,14 @@ static double HistogramAddEval(const VP8LHistogram* const a,
double cost_threshold) {
double cost = 0;
const double sum_cost = a->bit_cost_ + b->bit_cost_;
- int i;
-
cost_threshold += sum_cost;
- // palette_code_bits_ is part of the cost evaluation for literal_.
- // TODO(skal): remove/simplify this palette_code_bits_?
- out->palette_code_bits_ =
- (a->palette_code_bits_ > b->palette_code_bits_) ? a->palette_code_bits_ :
- b->palette_code_bits_;
- for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
- out->literal_[i] = a->literal_[i] + b->literal_[i];
- }
- cost += PopulationCost(out->literal_, VP8LHistogramNumCodes(out));
- cost += ExtraCost(out->literal_ + 256, NUM_LENGTH_CODES);
- if (cost > cost_threshold) return cost;
-
- for (i = 0; i < 256; ++i) out->red_[i] = a->red_[i] + b->red_[i];
- cost += PopulationCost(out->red_, 256);
- if (cost > cost_threshold) return cost;
-
- for (i = 0; i < 256; ++i) out->blue_[i] = a->blue_[i] + b->blue_[i];
- cost += PopulationCost(out->blue_, 256);
- if (cost > cost_threshold) return cost;
-
- for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
- out->distance_[i] = a->distance_[i] + b->distance_[i];
+ if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
+ VP8LHistogramAdd(a, b, out);
+ out->bit_cost_ = cost;
+ out->palette_code_bits_ = a->palette_code_bits_;
}
- cost += PopulationCost(out->distance_, NUM_DISTANCE_CODES);
- cost += ExtraCost(out->distance_, NUM_DISTANCE_CODES);
- if (cost > cost_threshold) return cost;
- for (i = 0; i < 256; ++i) out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
- cost += PopulationCost(out->alpha_, 256);
-
- out->bit_cost_ = cost;
return cost - sum_cost;
}
@@ -294,52 +346,92 @@ static double HistogramAddEval(const VP8LHistogram* const a,
static double HistogramAddThresh(const VP8LHistogram* const a,
const VP8LHistogram* const b,
double cost_threshold) {
- int tmp[PIX_OR_COPY_CODES_MAX]; // <= max storage we'll need
- int i;
double cost = -a->bit_cost_;
+ GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
+ return cost;
+}
- for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
- tmp[i] = a->literal_[i] + b->literal_[i];
- }
- // note that the tests are ordered so that the usually largest
- // cost shares come first.
- cost += PopulationCost(tmp, VP8LHistogramNumCodes(a));
- cost += ExtraCost(tmp + 256, NUM_LENGTH_CODES);
- if (cost > cost_threshold) return cost;
-
- for (i = 0; i < 256; ++i) tmp[i] = a->red_[i] + b->red_[i];
- cost += PopulationCost(tmp, 256);
- if (cost > cost_threshold) return cost;
-
- for (i = 0; i < 256; ++i) tmp[i] = a->blue_[i] + b->blue_[i];
- cost += PopulationCost(tmp, 256);
- if (cost > cost_threshold) return cost;
-
- for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
- tmp[i] = a->distance_[i] + b->distance_[i];
- }
- cost += PopulationCost(tmp, NUM_DISTANCE_CODES);
- cost += ExtraCost(tmp, NUM_DISTANCE_CODES);
- if (cost > cost_threshold) return cost;
+// -----------------------------------------------------------------------------
- for (i = 0; i < 256; ++i) tmp[i] = a->alpha_[i] + b->alpha_[i];
- cost += PopulationCost(tmp, 256);
+// The structure to keep track of cost range for the three dominant entropy
+// symbols.
+// TODO(skal): Evaluate if float can be used here instead of double for
+// representing the entropy costs.
+typedef struct {
+ double literal_max_;
+ double literal_min_;
+ double red_max_;
+ double red_min_;
+ double blue_max_;
+ double blue_min_;
+} DominantCostRange;
+
+static void DominantCostRangeInit(DominantCostRange* const c) {
+ c->literal_max_ = 0.;
+ c->literal_min_ = MAX_COST;
+ c->red_max_ = 0.;
+ c->red_min_ = MAX_COST;
+ c->blue_max_ = 0.;
+ c->blue_min_ = MAX_COST;
+}
- return cost;
+static void UpdateDominantCostRange(
+ const VP8LHistogram* const h, DominantCostRange* const c) {
+ if (c->literal_max_ < h->literal_cost_) c->literal_max_ = h->literal_cost_;
+ if (c->literal_min_ > h->literal_cost_) c->literal_min_ = h->literal_cost_;
+ if (c->red_max_ < h->red_cost_) c->red_max_ = h->red_cost_;
+ if (c->red_min_ > h->red_cost_) c->red_min_ = h->red_cost_;
+ if (c->blue_max_ < h->blue_cost_) c->blue_max_ = h->blue_cost_;
+ if (c->blue_min_ > h->blue_cost_) c->blue_min_ = h->blue_cost_;
}
-// -----------------------------------------------------------------------------
+static void UpdateHistogramCost(VP8LHistogram* const h) {
+ const double alpha_cost = PopulationCost(h->alpha_, NUM_LITERAL_CODES);
+ const double distance_cost =
+ PopulationCost(h->distance_, NUM_DISTANCE_CODES) +
+ VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+ const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
+ h->literal_cost_ = PopulationCost(h->literal_, num_codes) +
+ VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
+ NUM_LENGTH_CODES);
+ h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES);
+ h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES);
+ h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
+ alpha_cost + distance_cost;
+}
-static void HistogramBuildImage(int xsize, int histo_bits,
- const VP8LBackwardRefs* const backward_refs,
- VP8LHistogramSet* const image) {
- int i;
+static int GetBinIdForEntropy(double min, double max, double val) {
+ const double range = max - min + 1e-6;
+ const double delta = val - min;
+ return (int)(NUM_PARTITIONS * delta / range);
+}
+
+// TODO(vikasa): Evaluate if there's any correlation between red & blue.
+static int GetHistoBinIndex(
+ const VP8LHistogram* const h, const DominantCostRange* const c) {
+ const int bin_id =
+ GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_) +
+ NUM_PARTITIONS * GetBinIdForEntropy(c->red_min_, c->red_max_,
+ h->red_cost_) +
+ NUM_PARTITIONS * NUM_PARTITIONS * GetBinIdForEntropy(c->literal_min_,
+ c->literal_max_,
+ h->literal_cost_);
+ assert(bin_id < BIN_SIZE);
+ return bin_id;
+}
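
[Editor's note] A worked example of the bin hashing, with hypothetical cost ranges (NUM_PARTITIONS = 4, hence BIN_SIZE = 64):

#include <assert.h>
#include <stdio.h>

#define NUM_PARTITIONS 4

static int BinId(double min, double max, double val) {
  const double range = max - min + 1e-6;
  return (int)(NUM_PARTITIONS * (val - min) / range);
}

int main(void) {
  /* made-up dominant-cost ranges and one histogram's costs */
  const int bin = BinId(1., 5., 1.)                         /* blue:    0 */
                + NUM_PARTITIONS * BinId(2., 10., 9.)       /* red:     3 */
                + NUM_PARTITIONS * NUM_PARTITIONS
                    * BinId(10., 50., 35.);                 /* literal: 2 */
  printf("bin = %d of %d\n", bin,
         NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS); /* 44 of 64 */
  assert(bin == 44);
  return 0;
}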
+
+// Construct the histograms from backward references.
+static void HistogramBuild(
+ int xsize, int histo_bits, const VP8LBackwardRefs* const backward_refs,
+ VP8LHistogramSet* const image_histo) {
int x = 0, y = 0;
const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
- VP8LHistogram** const histograms = image->histograms;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
assert(histo_bits > 0);
- for (i = 0; i < backward_refs->size; ++i) {
- const PixOrCopy* const v = &backward_refs->refs[i];
+ // Construct the histograms from the given backward references.
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
x += PixOrCopyLength(v);
@@ -347,9 +439,119 @@ static void HistogramBuildImage(int xsize, int histo_bits,
x -= xsize;
++y;
}
+ VP8LRefsCursorNext(&c);
}
}
+// Copies the histograms and computes each one's bit_cost.
+static void HistogramCopyAndAnalyze(
+ VP8LHistogramSet* const orig_histo, VP8LHistogramSet* const image_histo) {
+ int i;
+ const int histo_size = orig_histo->size;
+ VP8LHistogram** const orig_histograms = orig_histo->histograms;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ for (i = 0; i < histo_size; ++i) {
+ VP8LHistogram* const histo = orig_histograms[i];
+ UpdateHistogramCost(histo);
+ // Copy histograms from orig_histo[] to image_histo[].
+ HistogramCopy(histo, histograms[i]);
+ }
+}
+
+// Partition histograms to different entropy bins for three dominant (literal,
+// red and blue) symbol costs and compute the histogram aggregate bit_cost.
+static void HistogramAnalyzeEntropyBin(
+ VP8LHistogramSet* const image_histo, int16_t* const bin_map) {
+ int i;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ const int histo_size = image_histo->size;
+ const int bin_depth = histo_size + 1;
+ DominantCostRange cost_range;
+ DominantCostRangeInit(&cost_range);
+
+ // Analyze the dominant (literal, red and blue) entropy costs.
+ for (i = 0; i < histo_size; ++i) {
+ VP8LHistogram* const histo = histograms[i];
+ UpdateDominantCostRange(histo, &cost_range);
+ }
+
+ // bin-hash histograms on three of the dominant (literal, red and blue)
+ // symbol costs.
+ for (i = 0; i < histo_size; ++i) {
+ int num_histos;
+ VP8LHistogram* const histo = histograms[i];
+ const int16_t bin_id = (int16_t)GetHistoBinIndex(histo, &cost_range);
+ const int bin_offset = bin_id * bin_depth;
+ // bin_map[n][0] for every bin 'n' maintains the counter for the number of
+ // histograms in that bin.
+ // Get and increment the num_histos in that bin.
+ num_histos = ++bin_map[bin_offset];
+ assert(bin_offset + num_histos < bin_depth * BIN_SIZE);
+ // Store the i'th histogram's index at position num_histos (the last) in bin_map.
+ bin_map[bin_offset + num_histos] = i;
+ }
+}
+
+// Compact the histogram set by moving the valid histograms to the head of
+// the set and the ones that have been merged into other histograms towards
+// the end.
+// TODO(vikasa): Evaluate if this method can be avoided by altering the code
+// logic of HistogramCombineEntropyBin main loop.
+static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
+ int start = 0;
+ int end = image_histo->size - 1;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ while (start < end) {
+ while (start <= end && histograms[start] != NULL &&
+ histograms[start]->bit_cost_ != 0.) {
+ ++start;
+ }
+ while (start <= end && histograms[end]->bit_cost_ == 0.) {
+ histograms[end] = NULL;
+ --end;
+ }
+ if (start < end) {
+ assert(histograms[start] != NULL);
+ assert(histograms[end] != NULL);
+ HistogramCopy(histograms[end], histograms[start]);
+ histograms[end] = NULL;
+ --end;
+ }
+ }
+ image_histo->size = end + 1;
+}
+
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+ VP8LHistogram* const histos,
+ int16_t* const bin_map, int bin_depth,
+ double combine_cost_factor) {
+ int bin_id;
+ VP8LHistogram* cur_combo = histos;
+ VP8LHistogram** const histograms = image_histo->histograms;
+
+ for (bin_id = 0; bin_id < BIN_SIZE; ++bin_id) {
+ const int bin_offset = bin_id * bin_depth;
+ const int num_histos = bin_map[bin_offset];
+ const int idx1 = bin_map[bin_offset + 1];
+ int n;
+ for (n = 2; n <= num_histos; ++n) {
+ const int idx2 = bin_map[bin_offset + n];
+ const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
+ if (bit_cost_idx2 > 0.) {
+ const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
+ const double curr_cost_diff =
+ HistogramAddEval(histograms[idx1], histograms[idx2],
+ cur_combo, bit_cost_thresh);
+ if (curr_cost_diff < bit_cost_thresh) {
+ HistogramCopy(cur_combo, histograms[idx1]);
+ histograms[idx2]->bit_cost_ = 0.;
+ }
+ }
+ }
+ }
+ HistogramCompactBins(image_histo);
+}
+
static uint32_t MyRand(uint32_t *seed) {
*seed *= 16807U;
if (*seed == 0) {
@@ -358,48 +560,45 @@ static uint32_t MyRand(uint32_t *seed) {
return *seed;
}
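
[Editor's note] MyRand() is a Lehmer-style linear congruential generator using the Park-Miller multiplier 16807, with the reduction done mod 2^32 by unsigned overflow rather than the classical modulus 2^31 - 1. A standalone copy for experimentation; the zero-reset branch is elided by the hunk boundary above, so its body here is an assumption:

#include <stdint.h>
#include <stdio.h>

static uint32_t MyRand(uint32_t* seed) {
  *seed *= 16807U;            /* Park-Miller multiplier, taken mod 2^32 */
  if (*seed == 0) *seed = 1;  /* assumed reset: escape the absorbing state */
  return *seed;
}

int main(void) {
  uint32_t seed = 1;
  int i;
  for (i = 0; i < 3; ++i) printf("%u\n", MyRand(&seed));
  return 0;
}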
-static int HistogramCombine(const VP8LHistogramSet* const in,
- VP8LHistogramSet* const out, int iter_mult,
- int num_pairs, int num_tries_no_success) {
- int ok = 0;
- int i, iter;
+static void HistogramCombine(VP8LHistogramSet* const image_histo,
+ VP8LHistogramSet* const histos, int quality) {
+ int iter;
uint32_t seed = 0;
int tries_with_no_success = 0;
- int out_size = in->size;
- const int outer_iters = in->size * iter_mult;
+ int image_histo_size = image_histo->size;
+ const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8;
+ const int outer_iters = image_histo_size * iter_mult;
+ const int num_pairs = image_histo_size / 2;
+ const int num_tries_no_success = outer_iters / 2;
const int min_cluster_size = 2;
- VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
- VP8LHistogram* cur_combo = histos + 0; // trial merged histogram
- VP8LHistogram* best_combo = histos + 1; // best merged histogram so far
- if (histos == NULL) goto End;
-
- // Copy histograms from in[] to out[].
- assert(in->size <= out->size);
- for (i = 0; i < in->size; ++i) {
- in->histograms[i]->bit_cost_ = VP8LHistogramEstimateBits(in->histograms[i]);
- *out->histograms[i] = *in->histograms[i];
- }
-
- // Collapse similar histograms in 'out'.
- for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
+ VP8LHistogram** const histograms = image_histo->histograms;
+ VP8LHistogram* cur_combo = histos->histograms[0]; // trial histogram
+ VP8LHistogram* best_combo = histos->histograms[1]; // best histogram so far
+
+ // Collapse similar histograms in 'image_histo'.
+ for (iter = 0;
+ iter < outer_iters && image_histo_size >= min_cluster_size;
+ ++iter) {
double best_cost_diff = 0.;
int best_idx1 = -1, best_idx2 = 1;
int j;
- const int num_tries = (num_pairs < out_size) ? num_pairs : out_size;
+ const int num_tries =
+ (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
seed += iter;
for (j = 0; j < num_tries; ++j) {
double curr_cost_diff;
// Choose two histograms at random and try to combine them.
- const uint32_t idx1 = MyRand(&seed) % out_size;
+ const uint32_t idx1 = MyRand(&seed) % image_histo_size;
const uint32_t tmp = (j & 7) + 1;
- const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
- const uint32_t idx2 = (idx1 + diff + 1) % out_size;
+ const uint32_t diff =
+ (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
+ const uint32_t idx2 = (idx1 + diff + 1) % image_histo_size;
if (idx1 == idx2) {
continue;
}
+
// Calculate cost reduction on combining.
- curr_cost_diff = HistogramAddEval(out->histograms[idx1],
- out->histograms[idx2],
+ curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
cur_combo, best_cost_diff);
if (curr_cost_diff < best_cost_diff) { // found a better pair?
{ // swap cur/best combo histograms
@@ -414,12 +613,12 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
}
if (best_idx1 >= 0) {
- *out->histograms[best_idx1] = *best_combo;
+ HistogramCopy(best_combo, histograms[best_idx1]);
// swap best_idx2 slot with last one (which is now unused)
- --out_size;
- if (best_idx2 != out_size) {
- out->histograms[best_idx2] = out->histograms[out_size];
- out->histograms[out_size] = NULL; // just for sanity check.
+ --image_histo_size;
+ if (best_idx2 != image_histo_size) {
+ HistogramCopy(histograms[image_histo_size], histograms[best_idx2]);
+ histograms[image_histo_size] = NULL;
}
tries_with_no_success = 0;
}
@@ -427,38 +626,28 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
break;
}
}
- out->size = out_size;
- ok = 1;
-
- End:
- free(histos);
- return ok;
+ image_histo->size = image_histo_size;
}
// -----------------------------------------------------------------------------
// Histogram refinement
-// What is the bit cost of moving square_histogram from cur_symbol to candidate.
-static double HistogramDistance(const VP8LHistogram* const square_histogram,
- const VP8LHistogram* const candidate,
- double cost_threshold) {
- return HistogramAddThresh(candidate, square_histogram, cost_threshold);
-}
-
// Find the best 'out' histogram for each of the 'in' histograms.
// Note: we assume that out[]->bit_cost_ is already up-to-date.
-static void HistogramRemap(const VP8LHistogramSet* const in,
- const VP8LHistogramSet* const out,
+static void HistogramRemap(const VP8LHistogramSet* const orig_histo,
+ const VP8LHistogramSet* const image_histo,
uint16_t* const symbols) {
int i;
- for (i = 0; i < in->size; ++i) {
+ VP8LHistogram** const orig_histograms = orig_histo->histograms;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ for (i = 0; i < orig_histo->size; ++i) {
int best_out = 0;
double best_bits =
- HistogramDistance(in->histograms[i], out->histograms[0], 1.e38);
+ HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST);
int k;
- for (k = 1; k < out->size; ++k) {
+ for (k = 1; k < image_histo->size; ++k) {
const double cur_bits =
- HistogramDistance(in->histograms[i], out->histograms[k], best_bits);
+ HistogramAddThresh(histograms[k], orig_histograms[i], best_bits);
if (cur_bits < best_bits) {
best_bits = cur_bits;
best_out = k;
@@ -468,45 +657,85 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
}
// Recompute each out based on raw and symbols.
- for (i = 0; i < out->size; ++i) {
- HistogramClear(out->histograms[i]);
+ for (i = 0; i < image_histo->size; ++i) {
+ HistogramClear(histograms[i]);
}
- for (i = 0; i < in->size; ++i) {
- HistogramAdd(in->histograms[i], out->histograms[symbols[i]]);
+
+ for (i = 0; i < orig_histo->size; ++i) {
+ const int idx = symbols[i];
+ VP8LHistogramAdd(orig_histograms[i], histograms[idx], histograms[idx]);
}
}
+static double GetCombineCostFactor(int histo_size, int quality) {
+ double combine_cost_factor = 0.16;
+ if (histo_size > 256) combine_cost_factor /= 2.;
+ if (histo_size > 512) combine_cost_factor /= 2.;
+ if (histo_size > 1024) combine_cost_factor /= 2.;
+ if (quality <= 50) combine_cost_factor /= 2.;
+ return combine_cost_factor;
+}
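
[Editor's note] Tracing the factor for a mid-size, mid-quality image (the numbers follow directly from the halvings above): histo_size = 600 and quality = 40 hit the > 256, > 512 and <= 50 branches, so 0.16 / 2 / 2 / 2 = 0.02. As a runnable check:

#include <stdio.h>

static double GetCombineCostFactor(int histo_size, int quality) {
  double f = 0.16;
  if (histo_size > 256) f /= 2.;
  if (histo_size > 512) f /= 2.;
  if (histo_size > 1024) f /= 2.;
  if (quality <= 50) f /= 2.;
  return f;
}

int main(void) {
  printf("%g\n", GetCombineCostFactor(600, 40));  /* prints 0.02 */
  return 0;
}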
+
int VP8LGetHistoImageSymbols(int xsize, int ysize,
const VP8LBackwardRefs* const refs,
int quality, int histo_bits, int cache_bits,
- VP8LHistogramSet* const image_in,
+ VP8LHistogramSet* const image_histo,
uint16_t* const histogram_symbols) {
int ok = 0;
const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
- const int histo_image_raw_size = histo_xsize * histo_ysize;
-
- // Heuristic params for HistogramCombine().
- const int num_tries_no_success = 8 + (quality >> 1);
- const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
- const int num_pairs = (quality < 25) ? 10 : (5 * quality) >> 3;
-
- VP8LHistogramSet* const image_out =
- VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
- if (image_out == NULL) return 0;
-
- // Build histogram image.
- HistogramBuildImage(xsize, histo_bits, refs, image_out);
- // Collapse similar histograms.
- if (!HistogramCombine(image_out, image_in, iter_mult, num_pairs,
- num_tries_no_success)) {
+ const int image_histo_raw_size = histo_xsize * histo_ysize;
+
+ // For every bin, bin_map follows these semantics:
+ // bin_map[n][0] = num_histo; // The number of histograms in that bin.
+ // bin_map[n][1] = index of first histogram in that bin;
+ // bin_map[n][num_histo] = index of last histogram in that bin;
+ // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = un-used indices.
+ const int bin_depth = image_histo_raw_size + 1;
+ int16_t* bin_map = NULL;
+ VP8LHistogramSet* const histos = VP8LAllocateHistogramSet(2, cache_bits);
+ VP8LHistogramSet* const orig_histo =
+ VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+
+ if (orig_histo == NULL || histos == NULL) {
goto Error;
}
+
+ // Don't attempt the linear bin-partition heuristic for:
+ // - histograms of small sizes, where bin_map would be very sparse, and
+ // - higher qualities (> 90), to preserve the compression gains at those
+ // quality settings.
+ if (orig_histo->size > 2 * BIN_SIZE && quality < 90) {
+ const int bin_map_size = bin_depth * BIN_SIZE;
+ bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
+ if (bin_map == NULL) goto Error;
+ }
+
+ // Construct the histograms from backward references.
+ HistogramBuild(xsize, histo_bits, refs, orig_histo);
+ // Copies the histograms and computes each one's bit_cost.
+ HistogramCopyAndAnalyze(orig_histo, image_histo);
+
+ if (bin_map != NULL) {
+ const double combine_cost_factor =
+ GetCombineCostFactor(image_histo_raw_size, quality);
+ HistogramAnalyzeEntropyBin(orig_histo, bin_map);
+ // Collapse histograms with similar entropy.
+ HistogramCombineEntropyBin(image_histo, histos->histograms[0],
+ bin_map, bin_depth, combine_cost_factor);
+ }
+
+ // Collapse similar histograms by random histogram-pair compares.
+ HistogramCombine(image_histo, histos, quality);
+
// Find the optimal map from original histograms to the final ones.
- HistogramRemap(image_out, image_in, histogram_symbols);
+ HistogramRemap(orig_histo, image_histo, histogram_symbols);
+
ok = 1;
-Error:
- free(image_out);
+ Error:
+ WebPSafeFree(bin_map);
+ VP8LFreeHistogramSet(orig_histo);
+ VP8LFreeHistogramSet(histos);
return ok;
}
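
[Editor's note] To make the flat bin_map layout documented inside VP8LGetHistoImageSymbols() concrete, here is a tiny hypothetical run (bin_depth = 5, histograms 7 and 9 hashed to bin 3), using the same count-then-append updates as HistogramAnalyzeEntropyBin():

#include <assert.h>
#include <stdint.h>

int main(void) {
  int16_t bin_map[64 * 5] = {0};  /* BIN_SIZE = 64 bins of bin_depth = 5 */
  const int bin_depth = 5;
  const int bin_offset = 3 * bin_depth;  /* both histograms hash to bin 3 */
  int num_histos;
  num_histos = ++bin_map[bin_offset];    /* count becomes 1 */
  bin_map[bin_offset + num_histos] = 7;  /* first histogram index */
  num_histos = ++bin_map[bin_offset];    /* count becomes 2 */
  bin_map[bin_offset + num_histos] = 9;  /* second histogram index */
  assert(bin_map[bin_offset] == 2);
  assert(bin_map[bin_offset + 1] == 7 && bin_map[bin_offset + 2] == 9);
  return 0;
}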
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram.h
index 4d346a8..1cf4c54 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram.h
@@ -32,18 +32,21 @@ extern "C" {
typedef struct {
// literal_ contains green literal, palette-code and
// copy-length-prefix histogram
- int literal_[PIX_OR_COPY_CODES_MAX];
- int red_[256];
- int blue_[256];
- int alpha_[256];
+ uint32_t* literal_; // Pointer to the allocated buffer for literal.
+ uint32_t red_[NUM_LITERAL_CODES];
+ uint32_t blue_[NUM_LITERAL_CODES];
+ uint32_t alpha_[NUM_LITERAL_CODES];
// Backward reference prefix-code histogram.
- int distance_[NUM_DISTANCE_CODES];
+ uint32_t distance_[NUM_DISTANCE_CODES];
int palette_code_bits_;
- double bit_cost_; // cached value of VP8LHistogramEstimateBits(this)
+ double bit_cost_; // cached value of VP8LHistogramEstimateBits(this)
+ double literal_cost_; // Cached values of dominant entropy costs:
+ double red_cost_; // literal, red & blue.
+ double blue_cost_;
} VP8LHistogram;
// Collection of histograms with fixed capacity, allocated as one
-// big memory chunk. Can be destroyed by simply calling 'free()'.
+// big memory chunk. Can be destroyed by calling WebPSafeFree().
typedef struct {
int size; // number of slots currently in use
int max_size; // maximum capacity
@@ -59,6 +62,9 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
const VP8LBackwardRefs* const refs,
int palette_code_bits);
+// Return the size of the histogram for a given palette_code_bits.
+int VP8LGetHistogramSize(int palette_code_bits);
+
// Set the palette_code_bits and reset the stats.
void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits);
@@ -66,10 +72,21 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits);
void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
VP8LHistogram* const histo);
+// Free the memory allocated for the histogram.
+void VP8LFreeHistogram(VP8LHistogram* const histo);
+
+// Free the memory allocated for the histogram set.
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo);
+
// Allocate an array of pointers to histograms, each allocated and
// initialized using 'cache_bits'. Returns NULL in case of memory error.
VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits);
+// Allocate and initialize a histogram object with the specified 'cache_bits'.
+// Returns NULL in case of memory error.
+// Special case of VP8LAllocateHistogramSet, with size equal to 1.
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits);
+
// Accumulate a token 'v' into a histogram.
void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
const PixOrCopy* const v);
@@ -82,9 +99,9 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
// represent the entropy code itself.
double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
-static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
- return 256 + NUM_LENGTH_CODES +
- ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
+static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
+ return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+ ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
}
// Builds the histogram image.
diff --git a/src/3rdparty/libwebp/src/enc/layer.c b/src/3rdparty/libwebp/src/enc/layer.c
deleted file mode 100644
index 2402362..0000000
--- a/src/3rdparty/libwebp/src/enc/layer.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Enhancement layer (for YUV444/422)
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <stdlib.h>
-
-#include "./vp8enci.h"
-
-//------------------------------------------------------------------------------
-
-void VP8EncInitLayer(VP8Encoder* const enc) {
- enc->use_layer_ = (enc->pic_->u0 != NULL);
- enc->layer_data_size_ = 0;
- enc->layer_data_ = NULL;
- if (enc->use_layer_) {
- VP8BitWriterInit(&enc->layer_bw_, enc->mb_w_ * enc->mb_h_ * 3);
- }
-}
-
-void VP8EncCodeLayerBlock(VP8EncIterator* it) {
- (void)it; // remove a warning
-}
-
-int VP8EncFinishLayer(VP8Encoder* const enc) {
- if (enc->use_layer_) {
- enc->layer_data_ = VP8BitWriterFinish(&enc->layer_bw_);
- enc->layer_data_size_ = VP8BitWriterSize(&enc->layer_bw_);
- }
- return 1;
-}
-
-void VP8EncDeleteLayer(VP8Encoder* enc) {
- free(enc->layer_data_);
-}
-
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture.c
index 011690d..9a66fbe 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture.c
@@ -7,506 +7,170 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// WebPPicture utils: colorspace conversion, crop, ...
+// WebPPicture class basis
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
-#include <math.h>
#include "./vp8enci.h"
-#include "../utils/alpha_processing.h"
-#include "../utils/random.h"
-#include "../utils/rescaler.h"
#include "../utils/utils.h"
-#include "../dsp/dsp.h"
-#include "../dsp/yuv.h"
-
-// Uncomment to disable gamma-compression during RGB->U/V averaging
-#define USE_GAMMA_COMPRESSION
-
-#define HALVE(x) (((x) + 1) >> 1)
-#define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
-
-static const union {
- uint32_t argb;
- uint8_t bytes[4];
-} test_endian = { 0xff000000u };
-#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
-
-static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
- return (0xff000000u | (r << 16) | (g << 8) | b);
-}
//------------------------------------------------------------------------------
// WebPPicture
//------------------------------------------------------------------------------
-int WebPPictureAlloc(WebPPicture* picture) {
- if (picture != NULL) {
- const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
- const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
- const int width = picture->width;
- const int height = picture->height;
-
- if (!picture->use_argb) {
- const int y_stride = width;
- const int uv_width = HALVE(width);
- const int uv_height = HALVE(height);
- const int uv_stride = uv_width;
- int uv0_stride = 0;
- int a_width, a_stride;
- uint64_t y_size, uv_size, uv0_size, a_size, total_size;
- uint8_t* mem;
-
- // U/V
- switch (uv_csp) {
- case WEBP_YUV420:
- break;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- case WEBP_YUV400: // for now, we'll just reset the U/V samples
- break;
- case WEBP_YUV422:
- uv0_stride = uv_width;
- break;
- case WEBP_YUV444:
- uv0_stride = width;
- break;
-#endif
- default:
- return 0;
- }
- uv0_size = height * uv0_stride;
-
- // alpha
- a_width = has_alpha ? width : 0;
- a_stride = a_width;
- y_size = (uint64_t)y_stride * height;
- uv_size = (uint64_t)uv_stride * uv_height;
- a_size = (uint64_t)a_stride * height;
-
- total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size;
-
- // Security and validation checks
- if (width <= 0 || height <= 0 || // luma/alpha param error
- uv_width < 0 || uv_height < 0) { // u/v param error
- return 0;
- }
- // Clear previous buffer and allocate a new one.
- WebPPictureFree(picture); // erase previous buffer
- mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
- if (mem == NULL) return 0;
-
- // From now on, we're in the clear, we can no longer fail...
- picture->memory_ = (void*)mem;
- picture->y_stride = y_stride;
- picture->uv_stride = uv_stride;
- picture->a_stride = a_stride;
- picture->uv0_stride = uv0_stride;
- // TODO(skal): we could align the y/u/v planes and adjust stride.
- picture->y = mem;
- mem += y_size;
-
- picture->u = mem;
- mem += uv_size;
- picture->v = mem;
- mem += uv_size;
-
- if (a_size) {
- picture->a = mem;
- mem += a_size;
- }
- if (uv0_size) {
- picture->u0 = mem;
- mem += uv0_size;
- picture->v0 = mem;
- mem += uv0_size;
- }
- (void)mem; // makes the static analyzer happy
- } else {
- void* memory;
- const uint64_t argb_size = (uint64_t)width * height;
- if (width <= 0 || height <= 0) {
- return 0;
- }
- // Clear previous buffer and allocate a new one.
- WebPPictureFree(picture); // erase previous buffer
- memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
- if (memory == NULL) return 0;
+static int DummyWriter(const uint8_t* data, size_t data_size,
+ const WebPPicture* const picture) {
+ // The following are to prevent 'unused variable' warnings.
+ (void)data;
+ (void)data_size;
+ (void)picture;
+ return 1;
+}
- // TODO(skal): align plane to cache line?
- picture->memory_argb_ = memory;
- picture->argb = (uint32_t*)memory;
- picture->argb_stride = width;
- }
+int WebPPictureInitInternal(WebPPicture* picture, int version) {
+ if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+ return 0; // caller/system version mismatch!
+ }
+ if (picture != NULL) {
+ memset(picture, 0, sizeof(*picture));
+ picture->writer = DummyWriter;
+ WebPEncodingSetError(picture, VP8_ENC_OK);
}
return 1;
}
-// Remove reference to the ARGB buffer (doesn't free anything).
-static void PictureResetARGB(WebPPicture* const picture) {
+//------------------------------------------------------------------------------
+
+static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
picture->memory_argb_ = NULL;
picture->argb = NULL;
picture->argb_stride = 0;
}
-// Remove reference to the YUVA buffer (doesn't free anything).
-static void PictureResetYUVA(WebPPicture* const picture) {
+static void WebPPictureResetBufferYUVA(WebPPicture* const picture) {
picture->memory_ = NULL;
picture->y = picture->u = picture->v = picture->a = NULL;
- picture->u0 = picture->v0 = NULL;
picture->y_stride = picture->uv_stride = 0;
picture->a_stride = 0;
- picture->uv0_stride = 0;
}
-// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
-// into 'dst'. Mark 'dst' as not owning any memory.
-static void WebPPictureGrabSpecs(const WebPPicture* const src,
- WebPPicture* const dst) {
- assert(src != NULL && dst != NULL);
- *dst = *src;
- PictureResetYUVA(dst);
- PictureResetARGB(dst);
+void WebPPictureResetBuffers(WebPPicture* const picture) {
+ WebPPictureResetBufferARGB(picture);
+ WebPPictureResetBufferYUVA(picture);
}
-// Allocate a new argb buffer, discarding any existing one and preserving
-// the other YUV(A) buffer.
-static int PictureAllocARGB(WebPPicture* const picture) {
- WebPPicture tmp;
- free(picture->memory_argb_);
- PictureResetARGB(picture);
- picture->use_argb = 1;
- WebPPictureGrabSpecs(picture, &tmp);
- if (!WebPPictureAlloc(&tmp)) {
- return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
- }
- picture->memory_argb_ = tmp.memory_argb_;
- picture->argb = tmp.argb;
- picture->argb_stride = tmp.argb_stride;
- return 1;
-}
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+ void* memory;
+ const uint64_t argb_size = (uint64_t)width * height;
-// Release memory owned by 'picture' (both YUV and ARGB buffers).
-void WebPPictureFree(WebPPicture* picture) {
- if (picture != NULL) {
- free(picture->memory_);
- free(picture->memory_argb_);
- PictureResetYUVA(picture);
- PictureResetARGB(picture);
- }
-}
+ assert(picture != NULL);
-//------------------------------------------------------------------------------
-// Picture copying
+ WebPSafeFree(picture->memory_argb_);
+ WebPPictureResetBufferARGB(picture);
-// Not worth moving to dsp/enc.c (only used here).
-static void CopyPlane(const uint8_t* src, int src_stride,
- uint8_t* dst, int dst_stride, int width, int height) {
- while (height-- > 0) {
- memcpy(dst, src, width);
- src += src_stride;
- dst += dst_stride;
+ if (width <= 0 || height <= 0) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
}
-}
-
-// Adjust top-left corner to chroma sample position.
-static void SnapTopLeftPosition(const WebPPicture* const pic,
- int* const left, int* const top) {
- if (!pic->use_argb) {
- const int is_yuv422 = IS_YUV_CSP(pic->colorspace, WEBP_YUV422);
- if (IS_YUV_CSP(pic->colorspace, WEBP_YUV420) || is_yuv422) {
- *left &= ~1;
- if (!is_yuv422) *top &= ~1;
- }
+ // allocate a new buffer.
+ memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
+ if (memory == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
-}
-
-// Adjust top-left corner and verify that the sub-rectangle is valid.
-static int AdjustAndCheckRectangle(const WebPPicture* const pic,
- int* const left, int* const top,
- int width, int height) {
- SnapTopLeftPosition(pic, left, top);
- if ((*left) < 0 || (*top) < 0) return 0;
- if (width <= 0 || height <= 0) return 0;
- if ((*left) + width > pic->width) return 0;
- if ((*top) + height > pic->height) return 0;
+ // TODO(skal): align plane to cache line?
+ picture->memory_argb_ = memory;
+ picture->argb = (uint32_t*)memory;
+ picture->argb_stride = width;
return 1;
}
-int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
- if (src == NULL || dst == NULL) return 0;
- if (src == dst) return 1;
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
+ const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
+ const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+ const int y_stride = width;
+ const int uv_width = (width + 1) >> 1;
+ const int uv_height = (height + 1) >> 1;
+ const int uv_stride = uv_width;
+ int a_width, a_stride;
+ uint64_t y_size, uv_size, a_size, total_size;
+ uint8_t* mem;
- WebPPictureGrabSpecs(src, dst);
- if (!WebPPictureAlloc(dst)) return 0;
+ assert(picture != NULL);
- if (!src->use_argb) {
- CopyPlane(src->y, src->y_stride,
- dst->y, dst->y_stride, dst->width, dst->height);
- CopyPlane(src->u, src->uv_stride,
- dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
- CopyPlane(src->v, src->uv_stride,
- dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
- if (dst->a != NULL) {
- CopyPlane(src->a, src->a_stride,
- dst->a, dst->a_stride, dst->width, dst->height);
- }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (dst->u0 != NULL) {
- int uv0_width = src->width;
- if (IS_YUV_CSP(dst->colorspace, WEBP_YUV422)) {
- uv0_width = HALVE(uv0_width);
- }
- CopyPlane(src->u0, src->uv0_stride,
- dst->u0, dst->uv0_stride, uv0_width, dst->height);
- CopyPlane(src->v0, src->uv0_stride,
- dst->v0, dst->uv0_stride, uv0_width, dst->height);
- }
-#endif
- } else {
- CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
- (uint8_t*)dst->argb, 4 * dst->argb_stride,
- 4 * dst->width, dst->height);
- }
- return 1;
-}
+ WebPSafeFree(picture->memory_);
+ WebPPictureResetBufferYUVA(picture);
-int WebPPictureIsView(const WebPPicture* picture) {
- if (picture == NULL) return 0;
- if (picture->use_argb) {
- return (picture->memory_argb_ == NULL);
+ if (uv_csp != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
}
- return (picture->memory_ == NULL);
-}
-int WebPPictureView(const WebPPicture* src,
- int left, int top, int width, int height,
- WebPPicture* dst) {
- if (src == NULL || dst == NULL) return 0;
+ // alpha
+ a_width = has_alpha ? width : 0;
+ a_stride = a_width;
+ y_size = (uint64_t)y_stride * height;
+ uv_size = (uint64_t)uv_stride * uv_height;
+ a_size = (uint64_t)a_stride * height;
- // verify rectangle position.
- if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+ total_size = y_size + a_size + 2 * uv_size;
- if (src != dst) { // beware of aliasing! We don't want to leak 'memory_'.
- WebPPictureGrabSpecs(src, dst);
+ // Security and validation checks
+ if (width <= 0 || height <= 0 || // luma/alpha param error
+ uv_width < 0 || uv_height < 0) { // u/v param error
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
}
- dst->width = width;
- dst->height = height;
- if (!src->use_argb) {
- dst->y = src->y + top * src->y_stride + left;
- dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
- dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
- dst->y_stride = src->y_stride;
- dst->uv_stride = src->uv_stride;
- if (src->a != NULL) {
- dst->a = src->a + top * src->a_stride + left;
- dst->a_stride = src->a_stride;
- }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (src->u0 != NULL) {
- const int left_pos =
- IS_YUV_CSP(dst->colorspace, WEBP_YUV422) ? (left >> 1) : left;
- dst->u0 = src->u0 + top * src->uv0_stride + left_pos;
- dst->v0 = src->v0 + top * src->uv0_stride + left_pos;
- dst->uv0_stride = src->uv0_stride;
- }
-#endif
- } else {
- dst->argb = src->argb + top * src->argb_stride + left;
- dst->argb_stride = src->argb_stride;
+ // allocate a new buffer.
+ mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+ if (mem == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
}
- return 1;
-}
-
-//------------------------------------------------------------------------------
-// Picture cropping
-int WebPPictureCrop(WebPPicture* pic,
- int left, int top, int width, int height) {
- WebPPicture tmp;
+ // From now on, we're in the clear, we can no longer fail...
+ picture->memory_ = (void*)mem;
+ picture->y_stride = y_stride;
+ picture->uv_stride = uv_stride;
+ picture->a_stride = a_stride;
- if (pic == NULL) return 0;
- if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+ // TODO(skal): we could align the y/u/v planes and adjust stride.
+ picture->y = mem;
+ mem += y_size;
- WebPPictureGrabSpecs(pic, &tmp);
- tmp.width = width;
- tmp.height = height;
- if (!WebPPictureAlloc(&tmp)) return 0;
+ picture->u = mem;
+ mem += uv_size;
+ picture->v = mem;
+ mem += uv_size;
- if (!pic->use_argb) {
- const int y_offset = top * pic->y_stride + left;
- const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
- CopyPlane(pic->y + y_offset, pic->y_stride,
- tmp.y, tmp.y_stride, width, height);
- CopyPlane(pic->u + uv_offset, pic->uv_stride,
- tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
- CopyPlane(pic->v + uv_offset, pic->uv_stride,
- tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
-
- if (tmp.a != NULL) {
- const int a_offset = top * pic->a_stride + left;
- CopyPlane(pic->a + a_offset, pic->a_stride,
- tmp.a, tmp.a_stride, width, height);
- }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (tmp.u0 != NULL) {
- int w = width;
- int left_pos = left;
- if (IS_YUV_CSP(tmp.colorspace, WEBP_YUV422)) {
- w = HALVE(w);
- left_pos = HALVE(left_pos);
- }
- CopyPlane(pic->u0 + top * pic->uv0_stride + left_pos, pic->uv0_stride,
- tmp.u0, tmp.uv0_stride, w, height);
- CopyPlane(pic->v0 + top * pic->uv0_stride + left_pos, pic->uv0_stride,
- tmp.v0, tmp.uv0_stride, w, height);
- }
-#endif
- } else {
- const uint8_t* const src =
- (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
- CopyPlane(src, pic->argb_stride * 4,
- (uint8_t*)tmp.argb, tmp.argb_stride * 4,
- width * 4, height);
+ if (a_size > 0) {
+ picture->a = mem;
+ mem += a_size;
}
- WebPPictureFree(pic);
- *pic = tmp;
+ (void)mem; // makes the static analyzer happy
return 1;
}
-//------------------------------------------------------------------------------
-// Simple picture rescaler
-
-static void RescalePlane(const uint8_t* src,
- int src_width, int src_height, int src_stride,
- uint8_t* dst,
- int dst_width, int dst_height, int dst_stride,
- int32_t* const work,
- int num_channels) {
- WebPRescaler rescaler;
- int y = 0;
- WebPRescalerInit(&rescaler, src_width, src_height,
- dst, dst_width, dst_height, dst_stride,
- num_channels,
- src_width, dst_width,
- src_height, dst_height,
- work);
- memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
- while (y < src_height) {
- y += WebPRescalerImport(&rescaler, src_height - y,
- src + y * src_stride, src_stride);
- WebPRescalerExport(&rescaler);
- }
-}
+int WebPPictureAlloc(WebPPicture* picture) {
+ if (picture != NULL) {
+ const int width = picture->width;
+ const int height = picture->height;
-static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
- uint32_t* ptr = pic->argb;
- int y;
- for (y = 0; y < pic->height; ++y) {
- WebPMultARGBRow(ptr, pic->width, inverse);
- ptr += pic->argb_stride;
- }
-}
+ WebPPictureFree(picture); // erase previous buffer
-static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
- const uint8_t* ptr_a = pic->a;
- if (ptr_a != NULL) {
- uint8_t* ptr_y = pic->y;
- int y;
- for (y = 0; y < pic->height; ++y) {
- WebPMultRow(ptr_y, ptr_a, pic->width, inverse);
- ptr_y += pic->y_stride;
- ptr_a += pic->a_stride;
+ if (!picture->use_argb) {
+ return WebPPictureAllocYUVA(picture, width, height);
+ } else {
+ return WebPPictureAllocARGB(picture, width, height);
}
}
+ return 1;
}
-int WebPPictureRescale(WebPPicture* pic, int width, int height) {
- WebPPicture tmp;
- int prev_width, prev_height;
- int32_t* work;
-
- if (pic == NULL) return 0;
- prev_width = pic->width;
- prev_height = pic->height;
- // if width is unspecified, scale original proportionally to height ratio.
- if (width == 0) {
- width = (prev_width * height + prev_height / 2) / prev_height;
- }
- // if height is unspecified, scale original proportionally to width ratio.
- if (height == 0) {
- height = (prev_height * width + prev_width / 2) / prev_width;
- }
- // Check if the overall dimensions still make sense.
- if (width <= 0 || height <= 0) return 0;
-
- WebPPictureGrabSpecs(pic, &tmp);
- tmp.width = width;
- tmp.height = height;
- if (!WebPPictureAlloc(&tmp)) return 0;
-
- if (!pic->use_argb) {
- work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
- if (work == NULL) {
- WebPPictureFree(&tmp);
- return 0;
- }
- // If present, we need to rescale alpha first (for AlphaMultiplyY).
- if (pic->a != NULL) {
- RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
- tmp.a, width, height, tmp.a_stride, work, 1);
- }
-
- // We take transparency into account on the luma plane only. That's not
- // totally exact blending, but still is a good approximation.
- AlphaMultiplyY(pic, 0);
- RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
- tmp.y, width, height, tmp.y_stride, work, 1);
- AlphaMultiplyY(&tmp, 1);
-
- RescalePlane(pic->u,
- HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
- tmp.u,
- HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
- RescalePlane(pic->v,
- HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
- tmp.v,
- HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (tmp.u0 != NULL) {
- const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
- RescalePlane(
- pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
- tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work, 1);
- RescalePlane(
- pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
- tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work, 1);
- }
-#endif
- } else {
- work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
- if (work == NULL) {
- WebPPictureFree(&tmp);
- return 0;
- }
- // In order to correctly interpolate colors, we need to apply the alpha
- // weighting first (black-matting), scale the RGB values, and remove
- // the premultiplication afterward (while preserving the alpha channel).
- AlphaMultiplyARGB(pic, 0);
- RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
- pic->argb_stride * 4,
- (uint8_t*)tmp.argb, width, height,
- tmp.argb_stride * 4,
- work, 4);
- AlphaMultiplyARGB(&tmp, 1);
+void WebPPictureFree(WebPPicture* picture) {
+ if (picture != NULL) {
+ WebPSafeFree(picture->memory_);
+ WebPSafeFree(picture->memory_argb_);
+ WebPPictureResetBuffers(picture);
}
- WebPPictureFree(pic);
- free(work);
- *pic = tmp;
- return 1;
}
//------------------------------------------------------------------------------
@@ -538,7 +202,7 @@ int WebPMemoryWrite(const uint8_t* data, size_t data_size,
if (w->size > 0) {
memcpy(new_mem, w->mem, w->size);
}
- free(w->mem);
+ WebPSafeFree(w->mem);
w->mem = new_mem;
// down-cast is ok, thanks to WebPSafeMalloc
w->max_size = (size_t)next_max_size;
@@ -550,713 +214,15 @@ int WebPMemoryWrite(const uint8_t* data, size_t data_size,
return 1;
}
-//------------------------------------------------------------------------------
-// Detection of non-trivial transparency
-
-// Returns true if alpha[] has non-0xff values.
-static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
- int x_step, int y_step) {
- if (alpha == NULL) return 0;
- while (height-- > 0) {
- int x;
- for (x = 0; x < width * x_step; x += x_step) {
- if (alpha[x] != 0xff) return 1; // TODO(skal): check 4/8 bytes at a time.
- }
- alpha += y_step;
- }
- return 0;
-}
-
-// Checking for the presence of non-opaque alpha.
-int WebPPictureHasTransparency(const WebPPicture* picture) {
- if (picture == NULL) return 0;
- if (!picture->use_argb) {
- return CheckNonOpaque(picture->a, picture->width, picture->height,
- 1, picture->a_stride);
- } else {
- int x, y;
- const uint32_t* argb = picture->argb;
- if (argb == NULL) return 0;
- for (y = 0; y < picture->height; ++y) {
- for (x = 0; x < picture->width; ++x) {
- if (argb[x] < 0xff000000u) return 1; // test any alpha values != 0xff
- }
- argb += picture->argb_stride;
- }
- }
- return 0;
-}
-
-//------------------------------------------------------------------------------
-// RGB -> YUV conversion
-
-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
- return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
- return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
- return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80
-#define kGammaFix 12 // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7 // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
-
-static int kLinearToGammaTab[kGammaTabSize + 1];
-static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
-
-static void InitGammaTables(void) {
- if (!kGammaTablesOk) {
- int v;
- const double scale = 1. / kGammaScale;
- for (v = 0; v <= 255; ++v) {
- kGammaToLinearTab[v] =
- (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
- }
- for (v = 0; v <= kGammaTabSize; ++v) {
- const double x = scale * (v << kGammaTabFix);
- kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
- }
- kGammaTablesOk = 1;
+void WebPMemoryWriterClear(WebPMemoryWriter* writer) {
+ if (writer != NULL) {
+ WebPSafeFree(writer->mem);
+ writer->mem = NULL;
+ writer->size = 0;
+ writer->max_size = 0;
}
}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
- return kGammaToLinearTab[v];
-}
-
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
- const int v = base_value << shift; // final uplifted value
- const int tab_pos = v >> (kGammaTabFix + 2); // integer part
- const int x = v & ((kGammaTabScale << 2) - 1); // fractional part
- const int v0 = kLinearToGammaTab[tab_pos];
- const int v1 = kLinearToGammaTab[tab_pos + 1];
- const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate
- return (y + kGammaTabRounder) >> kGammaTabFix; // descale
-}
-
-#else
-
-static void InitGammaTables(void) {}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
- (void)shift;
- return v;
-}
-
-#endif // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-#define SUM4(ptr) LinearToGamma( \
- GammaToLinear((ptr)[0]) + \
- GammaToLinear((ptr)[step]) + \
- GammaToLinear((ptr)[rgb_stride]) + \
- GammaToLinear((ptr)[rgb_stride + step]), 0) \
-
-#define SUM2H(ptr) \
- LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
-#define SUM2V(ptr) \
- LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-#define SUM1(ptr) \
- LinearToGamma(GammaToLinear((ptr)[0]), 2)
-
-#define RGB_TO_UV(x, y, SUM) { \
- const int src = (2 * (step * (x) + (y) * rgb_stride)); \
- const int dst = (x) + (y) * picture->uv_stride; \
- const int r = SUM(r_ptr + src); \
- const int g = SUM(g_ptr + src); \
- const int b = SUM(b_ptr + src); \
- picture->u[dst] = RGBToU(r, g, b, &rg); \
- picture->v[dst] = RGBToV(r, g, b, &rg); \
-}
-
-#define RGB_TO_UV0(x_in, x_out, y, SUM) { \
- const int src = (step * (x_in) + (y) * rgb_stride); \
- const int dst = (x_out) + (y) * picture->uv0_stride; \
- const int r = SUM(r_ptr + src); \
- const int g = SUM(g_ptr + src); \
- const int b = SUM(b_ptr + src); \
- picture->u0[dst] = RGBToU(r, g, b, &rg); \
- picture->v0[dst] = RGBToV(r, g, b, &rg); \
-}
-
-static void MakeGray(WebPPicture* const picture) {
- int y;
- const int uv_width = HALVE(picture->width);
- const int uv_height = HALVE(picture->height);
- for (y = 0; y < uv_height; ++y) {
- memset(picture->u + y * picture->uv_stride, 128, uv_width);
- memset(picture->v + y * picture->uv_stride, 128, uv_width);
- }
-}
-
-static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
- const uint8_t* const g_ptr,
- const uint8_t* const b_ptr,
- const uint8_t* const a_ptr,
- int step, // bytes per pixel
- int rgb_stride, // bytes per scanline
- float dithering,
- WebPPicture* const picture) {
- const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
- int x, y;
- const int width = picture->width;
- const int height = picture->height;
- const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
- VP8Random rg;
-
- picture->colorspace = uv_csp;
- picture->use_argb = 0;
- if (has_alpha) {
- picture->colorspace |= WEBP_CSP_ALPHA_BIT;
- }
- if (!WebPPictureAlloc(picture)) return 0;
-
- VP8InitRandom(&rg, dithering);
- InitGammaTables();
-
- // Import luma plane
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- const int offset = step * x + y * rgb_stride;
- picture->y[x + y * picture->y_stride] =
- RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
- }
- }
-
- // Downsample U/V plane
- if (uv_csp != WEBP_YUV400) {
- for (y = 0; y < (height >> 1); ++y) {
- for (x = 0; x < (width >> 1); ++x) {
- RGB_TO_UV(x, y, SUM4);
- }
- if (width & 1) {
- RGB_TO_UV(x, y, SUM2V);
- }
- }
- if (height & 1) {
- for (x = 0; x < (width >> 1); ++x) {
- RGB_TO_UV(x, y, SUM2H);
- }
- if (width & 1) {
- RGB_TO_UV(x, y, SUM1);
- }
- }
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- // Store original U/V samples too
- if (uv_csp == WEBP_YUV422) {
- for (y = 0; y < height; ++y) {
- for (x = 0; x < (width >> 1); ++x) {
- RGB_TO_UV0(2 * x, x, y, SUM2H);
- }
- if (width & 1) {
- RGB_TO_UV0(2 * x, x, y, SUM1);
- }
- }
- } else if (uv_csp == WEBP_YUV444) {
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- RGB_TO_UV0(x, x, y, SUM1);
- }
- }
- }
-#endif
- } else {
- MakeGray(picture);
- }
-
- if (has_alpha) {
- assert(step >= 4);
- assert(picture->a != NULL);
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- picture->a[x + y * picture->a_stride] =
- a_ptr[step * x + y * rgb_stride];
- }
- }
- }
- return 1;
-}
-
-static int Import(WebPPicture* const picture,
- const uint8_t* const rgb, int rgb_stride,
- int step, int swap_rb, int import_alpha) {
- const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
- const uint8_t* const g_ptr = rgb + 1;
- const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
- const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
- const int width = picture->width;
- const int height = picture->height;
-
- if (!picture->use_argb) {
- return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
- 0.f /* no dithering */, picture);
- }
- if (import_alpha) {
- picture->colorspace |= WEBP_CSP_ALPHA_BIT;
- } else {
- picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
- }
- if (!WebPPictureAlloc(picture)) return 0;
-
- if (!import_alpha) {
- int x, y;
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- const int offset = step * x + y * rgb_stride;
- const uint32_t argb =
- MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
- picture->argb[x + y * picture->argb_stride] = argb;
- }
- }
- } else {
- int x, y;
- assert(step >= 4);
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- const int offset = step * x + y * rgb_stride;
- const uint32_t argb = ((uint32_t)a_ptr[offset] << 24) |
- (r_ptr[offset] << 16) |
- (g_ptr[offset] << 8) |
- (b_ptr[offset]);
- picture->argb[x + y * picture->argb_stride] = argb;
- }
- }
- }
- return 1;
-}
-#undef SUM4
-#undef SUM2V
-#undef SUM2H
-#undef SUM1
-#undef RGB_TO_UV
-
-int WebPPictureImportRGB(WebPPicture* picture,
- const uint8_t* rgb, int rgb_stride) {
- return Import(picture, rgb, rgb_stride, 3, 0, 0);
-}
-
-int WebPPictureImportBGR(WebPPicture* picture,
- const uint8_t* rgb, int rgb_stride) {
- return Import(picture, rgb, rgb_stride, 3, 1, 0);
-}
-
-int WebPPictureImportRGBA(WebPPicture* picture,
- const uint8_t* rgba, int rgba_stride) {
- return Import(picture, rgba, rgba_stride, 4, 0, 1);
-}
-
-int WebPPictureImportBGRA(WebPPicture* picture,
- const uint8_t* rgba, int rgba_stride) {
- return Import(picture, rgba, rgba_stride, 4, 1, 1);
-}
-
-int WebPPictureImportRGBX(WebPPicture* picture,
- const uint8_t* rgba, int rgba_stride) {
- return Import(picture, rgba, rgba_stride, 4, 0, 0);
-}
-
-int WebPPictureImportBGRX(WebPPicture* picture,
- const uint8_t* rgba, int rgba_stride) {
- return Import(picture, rgba, rgba_stride, 4, 1, 0);
-}
-
-//------------------------------------------------------------------------------
-// Automatic YUV <-> ARGB conversions.
-
-int WebPPictureYUVAToARGB(WebPPicture* picture) {
- if (picture == NULL) return 0;
- if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
- return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
- }
- if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
- return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
- }
- if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
- return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
- }
- // Allocate a new argb buffer (discarding the previous one).
- if (!PictureAllocARGB(picture)) return 0;
-
- // Convert
- {
- int y;
- const int width = picture->width;
- const int height = picture->height;
- const int argb_stride = 4 * picture->argb_stride;
- uint8_t* dst = (uint8_t*)picture->argb;
- const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
- WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
-
- // First row, with replicated top samples.
- upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
- cur_y += picture->y_stride;
- dst += argb_stride;
- // Center rows.
- for (y = 1; y + 1 < height; y += 2) {
- const uint8_t* const top_u = cur_u;
- const uint8_t* const top_v = cur_v;
- cur_u += picture->uv_stride;
- cur_v += picture->uv_stride;
- upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
- dst, dst + argb_stride, width);
- cur_y += 2 * picture->y_stride;
- dst += 2 * argb_stride;
- }
- // Last row (if needed), with replicated bottom samples.
- if (height > 1 && !(height & 1)) {
- upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
- }
- // Insert alpha values if needed, in replacement for the default 0xff ones.
- if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
- for (y = 0; y < height; ++y) {
- uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
- const uint8_t* const src = picture->a + y * picture->a_stride;
- int x;
- for (x = 0; x < width; ++x) {
- argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
- }
- }
- }
- }
- return 1;
-}
-
-int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
- float dithering) {
- if (picture == NULL) return 0;
- if (picture->argb == NULL) {
- return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
- } else {
- const uint8_t* const argb = (const uint8_t*)picture->argb;
- const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
- const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
- const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
- const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
- // We work on a tmp copy of 'picture', because ImportYUVAFromRGBA()
- // would be calling WebPPictureFree(picture) otherwise.
- WebPPicture tmp = *picture;
- PictureResetARGB(&tmp); // reset ARGB buffer so that it's not free()'d.
- tmp.use_argb = 0;
- tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
- if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
- &tmp)) {
- return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
- }
- // Copy back the YUV specs into 'picture'.
- tmp.argb = picture->argb;
- tmp.argb_stride = picture->argb_stride;
- tmp.memory_argb_ = picture->memory_argb_;
- *picture = tmp;
- }
- return 1;
-}
-
-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
- return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
-}
-
-//------------------------------------------------------------------------------
-// Helper: clean up fully transparent area to help compressibility.
-
-#define SIZE 8
-#define SIZE2 (SIZE / 2)
-static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
- int y, x;
- for (y = 0; y < size; ++y) {
- for (x = 0; x < size; ++x) {
- if (ptr[x]) {
- return 0;
- }
- }
- ptr += stride;
- }
- return 1;
-}
-
-static WEBP_INLINE void flatten(uint8_t* ptr, int v, int stride, int size) {
- int y;
- for (y = 0; y < size; ++y) {
- memset(ptr, v, size);
- ptr += stride;
- }
-}
-
-void WebPCleanupTransparentArea(WebPPicture* pic) {
- int x, y, w, h;
- const uint8_t* a_ptr;
- int values[3] = { 0 };
-
- if (pic == NULL) return;
-
- a_ptr = pic->a;
- if (a_ptr == NULL) return; // nothing to do
-
- w = pic->width / SIZE;
- h = pic->height / SIZE;
- for (y = 0; y < h; ++y) {
- int need_reset = 1;
- for (x = 0; x < w; ++x) {
- const int off_a = (y * pic->a_stride + x) * SIZE;
- const int off_y = (y * pic->y_stride + x) * SIZE;
- const int off_uv = (y * pic->uv_stride + x) * SIZE2;
- if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
- if (need_reset) {
- values[0] = pic->y[off_y];
- values[1] = pic->u[off_uv];
- values[2] = pic->v[off_uv];
- need_reset = 0;
- }
- flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
- flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
- flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
- } else {
- need_reset = 1;
- }
- }
- // ignore the left-overs on right/bottom
- }
-}
-
-#undef SIZE
-#undef SIZE2
-
-//------------------------------------------------------------------------------
-// Blend color and remove transparency info
-
-#define BLEND(V0, V1, ALPHA) \
- ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
-#define BLEND_10BIT(V0, V1, ALPHA) \
- ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
-
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
- const int red = (background_rgb >> 16) & 0xff;
- const int green = (background_rgb >> 8) & 0xff;
- const int blue = (background_rgb >> 0) & 0xff;
- VP8Random rg;
- int x, y;
- if (pic == NULL) return;
- VP8InitRandom(&rg, 0.f);
- if (!pic->use_argb) {
- const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop
- const int Y0 = RGBToY(red, green, blue, &rg);
- // VP8RGBToU/V expects the u/v values summed over four pixels
- const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
- const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
- const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
- if (!has_alpha || pic->a == NULL) return; // nothing to do
- for (y = 0; y < pic->height; ++y) {
- // Luma blending
- uint8_t* const y_ptr = pic->y + y * pic->y_stride;
- uint8_t* const a_ptr = pic->a + y * pic->a_stride;
- for (x = 0; x < pic->width; ++x) {
- const int alpha = a_ptr[x];
- if (alpha < 0xff) {
- y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
- }
- }
- // Chroma blending every even line
- if ((y & 1) == 0) {
- uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
- uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
- uint8_t* const a_ptr2 =
- (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
- for (x = 0; x < uv_width; ++x) {
- // Average four alpha values into a single blending weight.
- // TODO(skal): might lead to visible contouring. Can we do better?
- const int alpha =
- a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
- a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
- u[x] = BLEND_10BIT(U0, u[x], alpha);
- v[x] = BLEND_10BIT(V0, v[x], alpha);
- }
- if (pic->width & 1) { // rightmost pixel
- const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
- u[x] = BLEND_10BIT(U0, u[x], alpha);
- v[x] = BLEND_10BIT(V0, v[x], alpha);
- }
- }
- memset(a_ptr, 0xff, pic->width);
- }
- } else {
- uint32_t* argb = pic->argb;
- const uint32_t background = MakeARGB32(red, green, blue);
- for (y = 0; y < pic->height; ++y) {
- for (x = 0; x < pic->width; ++x) {
- const int alpha = (argb[x] >> 24) & 0xff;
- if (alpha != 0xff) {
- if (alpha > 0) {
- int r = (argb[x] >> 16) & 0xff;
- int g = (argb[x] >> 8) & 0xff;
- int b = (argb[x] >> 0) & 0xff;
- r = BLEND(red, r, alpha);
- g = BLEND(green, g, alpha);
- b = BLEND(blue, b, alpha);
- argb[x] = MakeARGB32(r, g, b);
- } else {
- argb[x] = background;
- }
- }
- }
- argb += pic->argb_stride;
- }
- }
-}
-
-#undef BLEND
-#undef BLEND_10BIT
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-// search radius. Shouldn't be too large.
-#define RADIUS 2
-
-static float AccumulateLSIM(const uint8_t* src, int src_stride,
- const uint8_t* ref, int ref_stride,
- int w, int h) {
- int x, y;
- double total_sse = 0.;
- for (y = 0; y < h; ++y) {
- const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
- const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
- for (x = 0; x < w; ++x) {
- const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
- const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
- double best_sse = 255. * 255.;
- const double value = (double)ref[y * ref_stride + x];
- int i, j;
- for (j = y_0; j < y_1; ++j) {
- const uint8_t* s = src + j * src_stride;
- for (i = x_0; i < x_1; ++i) {
- const double sse = (double)(s[i] - value) * (s[i] - value);
- if (sse < best_sse) best_sse = sse;
- }
- }
- total_sse += best_sse;
- }
- }
- return (float)total_sse;
-}
-#undef RADIUS
-
-//------------------------------------------------------------------------------
-// Distortion
-
-// Max value returned in case of exact similarity.
-static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
- return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
- : kMinDistortion_dB);
-}
-
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
- int type, float result[5]) {
- DistoStats stats[5];
- int has_alpha;
- int uv_w, uv_h;
-
- if (src == NULL || ref == NULL ||
- src->width != ref->width || src->height != ref->height ||
- src->y == NULL || ref->y == NULL ||
- src->u == NULL || ref->u == NULL ||
- src->v == NULL || ref->v == NULL ||
- result == NULL) {
- return 0;
- }
- // TODO(skal): provide distortion for ARGB too.
- if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
- return 0;
- }
-
- has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
- if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
- (has_alpha && (src->a == NULL || ref->a == NULL))) {
- return 0;
- }
-
- memset(stats, 0, sizeof(stats));
-
- uv_w = HALVE(src->width);
- uv_h = HALVE(src->height);
- if (type >= 2) {
- float sse[4];
- sse[0] = AccumulateLSIM(src->y, src->y_stride,
- ref->y, ref->y_stride, src->width, src->height);
- sse[1] = AccumulateLSIM(src->u, src->uv_stride,
- ref->u, ref->uv_stride, uv_w, uv_h);
- sse[2] = AccumulateLSIM(src->v, src->uv_stride,
- ref->v, ref->uv_stride, uv_w, uv_h);
- sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
- ref->a, ref->a_stride,
- src->width, src->height)
- : 0.f;
- result[0] = GetPSNR(sse[0] / (src->width * src->height));
- result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
- result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
- result[3] = GetPSNR(sse[3] / (src->width * src->height));
- {
- double total_sse = sse[0] + sse[1] + sse[2];
- int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
- if (has_alpha) {
- total_pixels += src->width * src->height;
- total_sse += sse[3];
- }
- result[4] = GetPSNR(total_sse / total_pixels);
- }
- } else {
- int c;
- VP8SSIMAccumulatePlane(src->y, src->y_stride,
- ref->y, ref->y_stride,
- src->width, src->height, &stats[0]);
- VP8SSIMAccumulatePlane(src->u, src->uv_stride,
- ref->u, ref->uv_stride,
- uv_w, uv_h, &stats[1]);
- VP8SSIMAccumulatePlane(src->v, src->uv_stride,
- ref->v, ref->uv_stride,
- uv_w, uv_h, &stats[2]);
- if (has_alpha) {
- VP8SSIMAccumulatePlane(src->a, src->a_stride,
- ref->a, ref->a_stride,
- src->width, src->height, &stats[3]);
- }
- for (c = 0; c <= 4; ++c) {
- if (type == 1) {
- const double v = VP8SSIMGet(&stats[c]);
- result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
- : kMinDistortion_dB);
- } else {
- const double v = VP8SSIMGetSquaredError(&stats[c]);
- result[c] = GetPSNR(v);
- }
- // Accumulate forward
- if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
- }
- }
- return 1;
-}
-
//------------------------------------------------------------------------------
// Simplest high-level calls:
@@ -1286,7 +252,7 @@ static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
WebPPictureFree(&pic);
if (!ok) {
- free(wrt.mem);
+ WebPMemoryWriterClear(&wrt);
*output = NULL;
return 0;
}
@@ -1321,4 +287,3 @@ LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
#undef LOSSLESS_ENCODE_FUNC
//------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp.c
new file mode 100644
index 0000000..7875f62
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_csp.c
@@ -0,0 +1,1114 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture utils for colorspace conversion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vp8enci.h"
+#include "../utils/random.h"
+#include "../utils/utils.h"
+#include "../dsp/yuv.h"
+
+// Uncomment to disable gamma-compression during RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
+
+// If defined, use a table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
+static const union {
+ uint32_t argb;
+ uint8_t bytes[4];
+} test_endian = { 0xff000000u };
+#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+ return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
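+// E.g. MakeARGB32(0xff, 0x12, 0x34, 0x56) yields 0xff123456u; the packing
+// is arithmetic, so the result is independent of the host's endianness.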
+
+//------------------------------------------------------------------------------
+// Detection of non-trivial transparency
+
+// Returns true if alpha[] has non-0xff values.
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+ int x_step, int y_step) {
+ if (alpha == NULL) return 0;
+ while (height-- > 0) {
+ int x;
+ for (x = 0; x < width * x_step; x += x_step) {
+ if (alpha[x] != 0xff) return 1; // TODO(skal): check 4/8 bytes at a time.
+ }
+ alpha += y_step;
+ }
+ return 0;
+}
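+// (In CheckNonOpaque() above, 'x_step' is the byte distance between two
+// consecutive alpha samples in a row -- e.g. 4 for interleaved RGBA, 1 for
+// a separate alpha plane -- and 'y_step' is the row stride in bytes.)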
+
+// Checks for the presence of non-opaque alpha.
+int WebPPictureHasTransparency(const WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (!picture->use_argb) {
+ return CheckNonOpaque(picture->a, picture->width, picture->height,
+ 1, picture->a_stride);
+ } else {
+ int x, y;
+ const uint32_t* argb = picture->argb;
+ if (argb == NULL) return 0;
+ for (y = 0; y < picture->height; ++y) {
+ for (x = 0; x < picture->width; ++x) {
+ if (argb[x] < 0xff000000u) return 1; // test any alpha values != 0xff
+ }
+ argb += picture->argb_stride;
+ }
+ }
+ return 0;
+}
+
+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80 // for now we use a different gamma value than kGammaF
+#define kGammaFix 12 // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7 // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static int kGammaTablesOk = 0;
+
+static void InitGammaTables(void) {
+ if (!kGammaTablesOk) {
+ int v;
+ const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+ const double norm = 1. / 255.;
+ for (v = 0; v <= 255; ++v) {
+ kGammaToLinearTab[v] =
+ (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
+ }
+ kGammaTablesOk = 1;
+ }
+}
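+// With the constants above, kGammaToLinearTab[] maps an 8-bit gamma-encoded
+// sample to a linear value in [0, kGammaScale] (12-bit precision), while
+// kLinearToGammaTab[] stores kGammaTabSize + 1 = 33 coarse samples of the
+// inverse transfer function, interpolated below.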
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+ return kGammaToLinearTab[v];
+}
+
+static WEBP_INLINE int Interpolate(int v) {
+ const int tab_pos = v >> (kGammaTabFix + 2); // integer part
+ const int x = v & ((kGammaTabScale << 2) - 1); // fractional part
+ const int v0 = kLinearToGammaTab[tab_pos];
+ const int v1 = kLinearToGammaTab[tab_pos + 1];
+ const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate
+ assert(tab_pos + 1 < kGammaTabSize + 1);
+ return y;
+}
+
+// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
+// U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ const int y = Interpolate(base_value << shift); // final uplifted value
+ return (y + kGammaTabRounder) >> kGammaTabFix; // descale
+}
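+// The 'shift' argument normalizes sums of linear values to a common range:
+// callers pass a sum of four samples with shift == 0, of two samples with
+// shift == 1, or a single sample with shift == 2, so 'base_value << shift'
+// always spans [0, 4 * kGammaScale] before interpolation.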
+
+#else
+
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ return (int)(base_value << shift);
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
+ : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
+ : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
+ : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
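+// When 'rg' is NULL, no dithering is applied and the fixed-point conversion
+// rounds to nearest (YUV_HALF); otherwise the rounder is replaced by random
+// bits. RGBToU/V take r/g/b values summed over four pixels, hence the
+// extra << 2 on their rounders.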
+
+//------------------------------------------------------------------------------
+// Smart RGB->YUV conversion
+
+static const int kNumIterations = 6;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We deliberately use distinct precisions for storing the RGB and Y/W
+// components: we could use YFIX = 0 and plain uint8_t for fixed_y_t, but
+// that sometimes produces banding, so the extra precision is worth it.
+// TODO(skal): cleanup once TFIX/YFIX values are fixed.
+
+typedef int16_t fixed_t; // signed type with extra TFIX precision for UV
+typedef uint16_t fixed_y_t; // unsigned type with extra YFIX precision for W
+#define TFIX 6 // fixed-point precision of RGB
+#define YFIX 2 // fixed point precision for Y/W
+
+#define THALF ((1 << TFIX) >> 1)
+#define MAX_Y_T ((256 << YFIX) - 1)
+#define TROUNDER (1 << (YUV_FIX + TFIX - 1))
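+// With YFIX == 2 and TFIX == 6: fixed_y_t values span [0, MAX_Y_T] = [0, 1023]
+// (8-bit samples plus two fractional bits), while fixed_t holds the signed
+// RGB-minus-gray residuals, rescaled to TFIX precision via FixedYToW() below.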
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// float variant of gamma-correction
+// We use tables of a different size and precision here, along with a
+// 'real-world' gamma value close to 2.
+#define kGammaF 2.2
+static float kGammaToLinearTabF[MAX_Y_T + 1]; // size scales with Y_FIX
+static float kLinearToGammaTabF[kGammaTabSize + 2];
+static int kGammaTablesFOk = 0;
+
+static void InitGammaTablesF(void) {
+ if (!kGammaTablesFOk) {
+ int v;
+ const double norm = 1. / MAX_Y_T;
+ const double scale = 1. / kGammaTabSize;
+ for (v = 0; v <= MAX_Y_T; ++v) {
+ kGammaToLinearTabF[v] = (float)pow(norm * v, kGammaF);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ kLinearToGammaTabF[v] = (float)(MAX_Y_T * pow(scale * v, 1. / kGammaF));
+ }
+ // to prevent small rounding errors from causing a read overflow:
+ kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
+ kGammaTablesFOk = 1;
+ }
+}
+
+static WEBP_INLINE float GammaToLinearF(int v) {
+ return kGammaToLinearTabF[v];
+}
+
+static WEBP_INLINE float LinearToGammaF(float value) {
+ const float v = value * kGammaTabSize;
+ const int tab_pos = (int)v;
+ const float x = v - (float)tab_pos; // fractional part
+ const float v0 = kLinearToGammaTabF[tab_pos + 0];
+ const float v1 = kLinearToGammaTabF[tab_pos + 1];
+ const float y = v1 * x + v0 * (1.f - x); // interpolate
+ return y;
+}
+
+#else
+
+static void InitGammaTablesF(void) {}
+static WEBP_INLINE float GammaToLinearF(int v) {
+ const float norm = 1.f / MAX_Y_T;
+ return norm * v;
+}
+static WEBP_INLINE float LinearToGammaF(float value) {
+ return MAX_Y_T * value;
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+// precision: YFIX -> TFIX
+static WEBP_INLINE int FixedYToW(int v) {
+#if TFIX == YFIX
+ return v;
+#elif TFIX >= YFIX
+ return v << (TFIX - YFIX);
+#else
+ return v >> (YFIX - TFIX);
+#endif
+}
+
+static WEBP_INLINE int FixedWToY(int v) {
+#if TFIX == YFIX
+ return v;
+#elif YFIX >= TFIX
+ return v << (YFIX - TFIX);
+#else
+ return v >> (TFIX - YFIX);
+#endif
+}
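+// (With the current TFIX == 6 and YFIX == 2, FixedYToW() boils down to a
+// left shift by 4 and FixedWToY() to a right shift by 4; the #if chains just
+// keep the two conversions consistent if either constant is retuned.)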
+
+static uint8_t clip_8b(fixed_t v) {
+ return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+ return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+// precision: TFIX -> YFIX
+static fixed_y_t clip_fixed_t(fixed_t v) {
+ const int y = FixedWToY(v);
+ const fixed_y_t w = clip_y(y);
+ return w;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int r, int g, int b) {
+ const int luma = 19595 * r + 38470 * g + 7471 * b + YUV_HALF;
+ return (luma >> YUV_FIX);
+}
+
+static float RGBToGrayF(float r, float g, float b) {
+ return 0.299f * r + 0.587f * g + 0.114f * b;
+}
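+// RGBToGray() uses the BT.601 luma weights in YUV_FIX (= 16) fixed point:
+// 19595 / 65536 ~= 0.299, 38470 / 65536 ~= 0.587, 7471 / 65536 ~= 0.114.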
+
+static float ScaleDown(int a, int b, int c, int d) {
+ const float A = GammaToLinearF(a);
+ const float B = GammaToLinearF(b);
+ const float C = GammaToLinearF(c);
+ const float D = GammaToLinearF(d);
+ return LinearToGammaF(0.25f * (A + B + C + D));
+}
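+// Averaging the 2x2 block in linear light before re-encoding to gamma
+// avoids the darkening bias that averaging gamma-encoded samples directly
+// would introduce on high-contrast edges.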
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
+ while (len-- > 0) {
+ const float R = GammaToLinearF(src[0]);
+ const float G = GammaToLinearF(src[1]);
+ const float B = GammaToLinearF(src[2]);
+ const float Y = RGBToGrayF(R, G, B);
+ *dst++ = (fixed_y_t)(LinearToGammaF(Y) + .5);
+ src += 3;
+ }
+}
+
+static WEBP_INLINE void UpdateChroma(const fixed_y_t* src1,
+ const fixed_y_t* src2,
+ fixed_t* dst, fixed_y_t* tmp, int len) {
+ while (len-- > 0) {
+ const float r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
+ const float g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
+ const float b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+ const float W = RGBToGrayF(r, g, b);
+ dst[0] = (fixed_t)FixedYToW((int)(r - W));
+ dst[1] = (fixed_t)FixedYToW((int)(g - W));
+ dst[2] = (fixed_t)FixedYToW((int)(b - W));
+ dst += 3;
+ src1 += 6;
+ src2 += 6;
+ if (tmp != NULL) {
+ tmp[0] = tmp[1] = clip_y((int)(W + .5));
+ tmp += 2;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
+ int rightwise) {
+ int v;
+ if (!rightwise) {
+ v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
+ } else {
+ v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
+ }
+ return (v + 8) >> 4;
+}
+
+static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
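+// Filter() is the 2-D bilinear chroma-upsampling kernel [9 3; 3 1] / 16
+// (mirrored horizontally depending on 'rightwise'), and Filter2() is its
+// 1-D boundary variant (3 * A + B + 2) >> 2.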
+
+//------------------------------------------------------------------------------
+
+// 8bit -> YFIX
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {
+ return ((fixed_y_t)a << YFIX) | (1 << (YFIX - 1));
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ int pic_width,
+ fixed_y_t* const dst) {
+ int i;
+ for (i = 0; i < pic_width; ++i) {
+ const int off = i * step;
+ dst[3 * i + 0] = UpLift(r_ptr[off]);
+ dst[3 * i + 1] = UpLift(g_ptr[off]);
+ dst[3 * i + 2] = UpLift(b_ptr[off]);
+ }
+ if (pic_width & 1) { // replicate rightmost pixel
+ memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
+ }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+ const fixed_t* const prev_uv,
+ const fixed_t* const cur_uv,
+ const fixed_t* const next_uv,
+ int w,
+ fixed_y_t* const out1,
+ fixed_y_t* const out2) {
+ int i, k;
+ { // special boundary case for i==0
+ const int W0 = FixedYToW(best_y[0]);
+ const int W1 = FixedYToW(best_y[w]);
+ for (k = 0; k <= 2; ++k) {
+ out1[k] = clip_fixed_t(Filter2(cur_uv[k], prev_uv[k]) + W0);
+ out2[k] = clip_fixed_t(Filter2(cur_uv[k], next_uv[k]) + W1);
+ }
+ }
+ for (i = 1; i < w - 1; ++i) {
+ const int W0 = FixedYToW(best_y[i + 0]);
+ const int W1 = FixedYToW(best_y[i + w]);
+ const int off = 3 * (i >> 1);
+ for (k = 0; k <= 2; ++k) {
+ const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
+ const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
+ out1[3 * i + k] = clip_fixed_t(tmp0 + W0);
+ out2[3 * i + k] = clip_fixed_t(tmp1 + W1);
+ }
+ }
+ { // special boundary case for i == w - 1
+ const int W0 = FixedYToW(best_y[i + 0]);
+ const int W1 = FixedYToW(best_y[i + w]);
+ const int off = 3 * (i >> 1);
+ for (k = 0; k <= 2; ++k) {
+ out1[3 * i + k] =
+ clip_fixed_t(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
+ out2[3 * i + k] =
+ clip_fixed_t(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+ }
+ }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
+ const int luma = 16839 * r + 33059 * g + 6420 * b + TROUNDER;
+ return clip_8b(16 + (luma >> (YUV_FIX + TFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
+ const int u = -9719 * r - 19081 * g + 28800 * b + TROUNDER;
+ return clip_8b(128 + (u >> (YUV_FIX + TFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
+ const int v = +28800 * r - 24116 * g - 4684 * b + TROUNDER;
+ return clip_8b(128 + (v >> (YUV_FIX + TFIX)));
+}
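+
+// The constants above are the usual ITU-R BT.601 "studio swing" weights
+// expressed in YUV_FIX precision, e.g. 16839 / 65536 ~ 0.2568 and
+// 33059 / 65536 ~ 0.5044 (Y = 16 + 0.257 * R + 0.504 * G + 0.098 * B);
+// the extra TFIX in the shift removes the fixed-point scale carried by
+// the W/RGB input samples.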
+
+static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
+ const fixed_t* const best_uv,
+ WebPPicture* const picture) {
+ int i, j;
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ for (j = 0; j < picture->height; ++j) {
+ for (i = 0; i < picture->width; ++i) {
+ const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
+ const int off2 = i + j * picture->y_stride;
+ const int W = FixedYToW(best_y[i + j * w]);
+ const int r = best_uv[off + 0] + W;
+ const int g = best_uv[off + 1] + W;
+ const int b = best_uv[off + 2] + W;
+ picture->y[off2] = ConvertRGBToY(r, g, b);
+ }
+ }
+ for (j = 0; j < uv_h; ++j) {
+ uint8_t* const dst_u = picture->u + j * picture->uv_stride;
+ uint8_t* const dst_v = picture->v + j * picture->uv_stride;
+ for (i = 0; i < uv_w; ++i) {
+ const int off = 3 * (i + j * uv_w);
+ const int r = best_uv[off + 0];
+ const int g = best_uv[off + 1];
+ const int b = best_uv[off + 2];
+ dst_u[i] = ConvertRGBToU(r, g, b);
+ dst_v[i] = ConvertRGBToV(r, g, b);
+ }
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
+
+static int PreprocessARGB(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step, int rgb_stride,
+ WebPPicture* const picture) {
+ // we expand the right/bottom border if needed
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ int i, j, iter;
+
+ // TODO(skal): allocate one big memory chunk. But for now, it's easier
+ // for valgrind debugging to have several chunks.
+ fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
+ fixed_y_t* const best_y = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const target_y = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+ fixed_t* const best_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+ int ok;
+
+ if (best_y == NULL || best_uv == NULL ||
+ target_y == NULL || target_uv == NULL ||
+ best_rgb_y == NULL || best_rgb_uv == NULL ||
+ tmp_buffer == NULL) {
+ ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ goto End;
+ }
+ assert(picture->width >= kMinDimensionIterativeConversion);
+ assert(picture->height >= kMinDimensionIterativeConversion);
+
+ // Import RGB samples to W/RGB representation.
+ for (j = 0; j < picture->height; j += 2) {
+ const int is_last_row = (j == picture->height - 1);
+ fixed_y_t* const src1 = tmp_buffer;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+ const int off1 = j * rgb_stride;
+ const int off2 = off1 + rgb_stride;
+ const int uv_off = (j >> 1) * 3 * uv_w;
+ fixed_y_t* const dst_y = best_y + j * w;
+
+ // prepare two rows of input
+ ImportOneRow(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+ step, picture->width, src1);
+ if (!is_last_row) {
+ ImportOneRow(r_ptr + off2, g_ptr + off2, b_ptr + off2,
+ step, picture->width, src2);
+ } else {
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
+ }
+ UpdateW(src1, target_y + (j + 0) * w, w);
+ UpdateW(src2, target_y + (j + 1) * w, w);
+ UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
+ memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
+ memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
+ }
+
+ // Iterate and resolve clipping conflicts.
+ for (iter = 0; iter < kNumIterations; ++iter) {
+ int k;
+ const fixed_t* cur_uv = best_uv;
+ const fixed_t* prev_uv = best_uv;
+ for (j = 0; j < h; j += 2) {
+ fixed_y_t* const src1 = tmp_buffer;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+ {
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+ InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
+ w, src1, src2);
+ prev_uv = cur_uv;
+ cur_uv = next_uv;
+ }
+
+ UpdateW(src1, best_rgb_y + 0 * w, w);
+ UpdateW(src2, best_rgb_y + 1 * w, w);
+ UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+
+ // update two rows of Y and one row of RGB
+ for (i = 0; i < 2 * w; ++i) {
+ const int off = i + j * w;
+ const int diff_y = target_y[off] - best_rgb_y[i];
+ const int new_y = (int)best_y[off] + diff_y;
+ best_y[off] = clip_y(new_y);
+ }
+ for (i = 0; i < uv_w; ++i) {
+ const int off = 3 * (i + (j >> 1) * uv_w);
+ int W;
+ for (k = 0; k <= 2; ++k) {
+ const int diff_uv = (int)target_uv[off + k] - best_rgb_uv[3 * i + k];
+ best_uv[off + k] += diff_uv;
+ }
+ W = RGBToGray(best_uv[off + 0], best_uv[off + 1], best_uv[off + 2]);
+ for (k = 0; k <= 2; ++k) {
+ best_uv[off + k] -= W;
+ }
+ }
+ }
+ // TODO(skal): add early-termination criterion
+ }
+
+ // final reconstruction
+ ok = ConvertWRGBToYUV(best_y, best_uv, picture);
+
+ End:
+ WebPSafeFree(best_y);
+ WebPSafeFree(best_uv);
+ WebPSafeFree(target_y);
+ WebPSafeFree(target_uv);
+ WebPSafeFree(best_rgb_y);
+ WebPSafeFree(best_rgb_uv);
+ WebPSafeFree(tmp_buffer);
+ return ok;
+}
+#undef SAFE_ALLOC
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step) LinearToGamma( \
+ GammaToLinear((ptr)[0]) + \
+ GammaToLinear((ptr)[(step)]) + \
+ GammaToLinear((ptr)[rgb_stride]) + \
+ GammaToLinear((ptr)[rgb_stride + (step)]), 0) \
+
+#define SUM2(ptr) \
+ LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
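+
+// SUM4 averages a 2x2 block in linear-light space, SUM2 a 1x2 column (used
+// for an odd rightmost pixel). LinearToGamma() expects its argument to be
+// premultiplied by 4 (see the comment above DIVIDE_BY_ALPHA further down):
+// a sum of four samples already is, while SUM2's sum of two gets the
+// missing factor of two from its shift argument of 1.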
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;
+// The following table is (1 << kAlphaFix) / a. The formula
+// (v * kInvAlpha[a]) >> kAlphaFix then equals v / a in most (99.6%) cases.
+// Note that this table and constant are tuned very tightly to fit 32-bit
+// arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
+// overflow is: kGammaFix + kAlphaFix <= 31.
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {
+ 0, /* alpha = 0 */
+ 524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
+ 58254, 52428, 47662, 43690, 40329, 37449, 34952, 32768,
+ 30840, 29127, 27594, 26214, 24966, 23831, 22795, 21845,
+ 20971, 20164, 19418, 18724, 18078, 17476, 16912, 16384,
+ 15887, 15420, 14979, 14563, 14169, 13797, 13443, 13107,
+ 12787, 12483, 12192, 11915, 11650, 11397, 11155, 10922,
+ 10699, 10485, 10280, 10082, 9892, 9709, 9532, 9362,
+ 9198, 9039, 8886, 8738, 8594, 8456, 8322, 8192,
+ 8065, 7943, 7825, 7710, 7598, 7489, 7384, 7281,
+ 7182, 7084, 6990, 6898, 6808, 6721, 6636, 6553,
+ 6472, 6393, 6316, 6241, 6168, 6096, 6026, 5957,
+ 5890, 5825, 5761, 5698, 5637, 5577, 5518, 5461,
+ 5405, 5349, 5295, 5242, 5190, 5140, 5090, 5041,
+ 4993, 4946, 4899, 4854, 4809, 4766, 4723, 4681,
+ 4639, 4599, 4559, 4519, 4481, 4443, 4405, 4369,
+ 4332, 4297, 4262, 4228, 4194, 4161, 4128, 4096,
+ 4064, 4032, 4002, 3971, 3942, 3912, 3883, 3855,
+ 3826, 3799, 3771, 3744, 3718, 3692, 3666, 3640,
+ 3615, 3591, 3566, 3542, 3518, 3495, 3472, 3449,
+ 3426, 3404, 3382, 3360, 3339, 3318, 3297, 3276,
+ 3256, 3236, 3216, 3196, 3177, 3158, 3139, 3120,
+ 3102, 3084, 3066, 3048, 3030, 3013, 2995, 2978,
+ 2962, 2945, 2928, 2912, 2896, 2880, 2864, 2849,
+ 2833, 2818, 2803, 2788, 2774, 2759, 2744, 2730,
+ 2716, 2702, 2688, 2674, 2661, 2647, 2634, 2621,
+ 2608, 2595, 2582, 2570, 2557, 2545, 2532, 2520,
+ 2508, 2496, 2484, 2473, 2461, 2449, 2438, 2427,
+ 2416, 2404, 2394, 2383, 2372, 2361, 2351, 2340,
+ 2330, 2319, 2309, 2299, 2289, 2279, 2269, 2259,
+ 2250, 2240, 2231, 2221, 2212, 2202, 2193, 2184,
+ 2175, 2166, 2157, 2148, 2139, 2131, 2122, 2114,
+ 2105, 2097, 2088, 2080, 2072, 2064, 2056, 2048,
+ 2040, 2032, 2024, 2016, 2008, 2001, 1993, 1985,
+ 1978, 1971, 1963, 1956, 1949, 1941, 1934, 1927,
+ 1920, 1913, 1906, 1899, 1892, 1885, 1879, 1872,
+ 1865, 1859, 1852, 1846, 1839, 1833, 1826, 1820,
+ 1814, 1807, 1801, 1795, 1789, 1783, 1777, 1771,
+ 1765, 1759, 1753, 1747, 1741, 1736, 1730, 1724,
+ 1718, 1713, 1707, 1702, 1696, 1691, 1685, 1680,
+ 1675, 1669, 1664, 1659, 1653, 1648, 1643, 1638,
+ 1633, 1628, 1623, 1618, 1613, 1608, 1603, 1598,
+ 1593, 1588, 1583, 1579, 1574, 1569, 1565, 1560,
+ 1555, 1551, 1546, 1542, 1537, 1533, 1528, 1524,
+ 1519, 1515, 1510, 1506, 1502, 1497, 1493, 1489,
+ 1485, 1481, 1476, 1472, 1468, 1464, 1460, 1456,
+ 1452, 1448, 1444, 1440, 1436, 1432, 1428, 1424,
+ 1420, 1416, 1413, 1409, 1405, 1401, 1398, 1394,
+ 1390, 1387, 1383, 1379, 1376, 1372, 1368, 1365,
+ 1361, 1358, 1354, 1351, 1347, 1344, 1340, 1337,
+ 1334, 1330, 1327, 1323, 1320, 1317, 1314, 1310,
+ 1307, 1304, 1300, 1297, 1294, 1291, 1288, 1285,
+ 1281, 1278, 1275, 1272, 1269, 1266, 1263, 1260,
+ 1257, 1254, 1251, 1248, 1245, 1242, 1239, 1236,
+ 1233, 1230, 1227, 1224, 1222, 1219, 1216, 1213,
+ 1210, 1208, 1205, 1202, 1199, 1197, 1194, 1191,
+ 1188, 1186, 1183, 1180, 1178, 1175, 1172, 1170,
+ 1167, 1165, 1162, 1159, 1157, 1154, 1152, 1149,
+ 1147, 1144, 1142, 1139, 1137, 1134, 1132, 1129,
+ 1127, 1125, 1122, 1120, 1117, 1115, 1113, 1110,
+ 1108, 1106, 1103, 1101, 1099, 1096, 1094, 1092,
+ 1089, 1087, 1085, 1083, 1081, 1078, 1076, 1074,
+ 1072, 1069, 1067, 1065, 1063, 1061, 1059, 1057,
+ 1054, 1052, 1050, 1048, 1046, 1044, 1042, 1040,
+ 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024,
+ 1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008,
+ 1006, 1004, 1002, 1000, 998, 996, 994, 992,
+ 991, 989, 987, 985, 983, 981, 979, 978,
+ 976, 974, 972, 970, 969, 967, 965, 963,
+ 961, 960, 958, 956, 954, 953, 951, 949,
+ 948, 946, 944, 942, 941, 939, 937, 936,
+ 934, 932, 931, 929, 927, 926, 924, 923,
+ 921, 919, 918, 916, 914, 913, 911, 910,
+ 908, 907, 905, 903, 902, 900, 899, 897,
+ 896, 894, 893, 891, 890, 888, 887, 885,
+ 884, 882, 881, 879, 878, 876, 875, 873,
+ 872, 870, 869, 868, 866, 865, 863, 862,
+ 860, 859, 858, 856, 855, 853, 852, 851,
+ 849, 848, 846, 845, 844, 842, 841, 840,
+ 838, 837, 836, 834, 833, 832, 830, 829,
+ 828, 826, 825, 824, 823, 821, 820, 819,
+ 817, 816, 815, 814, 812, 811, 810, 809,
+ 807, 806, 805, 804, 802, 801, 800, 799,
+ 798, 796, 795, 794, 793, 791, 790, 789,
+ 788, 787, 786, 784, 783, 782, 781, 780,
+ 779, 777, 776, 775, 774, 773, 772, 771,
+ 769, 768, 767, 766, 765, 764, 763, 762,
+ 760, 759, 758, 757, 756, 755, 754, 753,
+ 752, 751, 750, 748, 747, 746, 745, 744,
+ 743, 742, 741, 740, 739, 738, 737, 736,
+ 735, 734, 733, 732, 731, 730, 729, 728,
+ 727, 726, 725, 724, 723, 722, 721, 720,
+ 719, 718, 717, 716, 715, 714, 713, 712,
+ 711, 710, 709, 708, 707, 706, 705, 704,
+ 703, 702, 701, 700, 699, 699, 698, 697,
+ 696, 695, 694, 693, 692, 691, 690, 689,
+ 688, 688, 687, 686, 685, 684, 683, 682,
+ 681, 680, 680, 679, 678, 677, 676, 675,
+ 674, 673, 673, 672, 671, 670, 669, 668,
+ 667, 667, 666, 665, 664, 663, 662, 661,
+ 661, 660, 659, 658, 657, 657, 656, 655,
+ 654, 653, 652, 652, 651, 650, 649, 648,
+ 648, 647, 646, 645, 644, 644, 643, 642,
+ 641, 640, 640, 639, 638, 637, 637, 636,
+ 635, 634, 633, 633, 632, 631, 630, 630,
+ 629, 628, 627, 627, 626, 625, 624, 624,
+ 623, 622, 621, 621, 620, 619, 618, 618,
+ 617, 616, 616, 615, 614, 613, 613, 612,
+ 611, 611, 610, 609, 608, 608, 607, 606,
+ 606, 605, 604, 604, 603, 602, 601, 601,
+ 600, 599, 599, 598, 597, 597, 596, 595,
+ 595, 594, 593, 593, 592, 591, 591, 590,
+ 589, 589, 588, 587, 587, 586, 585, 585,
+ 584, 583, 583, 582, 581, 581, 580, 579,
+ 579, 578, 578, 577, 576, 576, 575, 574,
+ 574, 573, 572, 572, 571, 571, 570, 569,
+ 569, 568, 568, 567, 566, 566, 565, 564,
+ 564, 563, 563, 562, 561, 561, 560, 560,
+ 559, 558, 558, 557, 557, 556, 555, 555,
+ 554, 554, 553, 553, 552, 551, 551, 550,
+ 550, 549, 548, 548, 547, 547, 546, 546,
+ 545, 544, 544, 543, 543, 542, 542, 541,
+ 541, 540, 539, 539, 538, 538, 537, 537,
+ 536, 536, 535, 534, 534, 533, 533, 532,
+ 532, 531, 531, 530, 530, 529, 529, 528,
+ 527, 527, 526, 526, 525, 525, 524, 524,
+ 523, 523, 522, 522, 521, 521, 520, 520,
+ 519, 519, 518, 518, 517, 517, 516, 516,
+ 515, 515, 514, 514
+};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor of 4 inside the DIVIDE_BY_ALPHA macro.
+#define DIVIDE_BY_ALPHA(sum, a) (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
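+
+// Worked example: kInvAlpha[2] == 262144 == (1 << 19) / 2, so
+// DIVIDE_BY_ALPHA(sum, 2) == (sum * 262144) >> 17 == 4 * sum / 2, i.e. the
+// exact quotient already carrying the factor of 4 described above.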
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
+
+#endif // USE_INVERSE_ALPHA_TABLE
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
+ const uint8_t* a_ptr,
+ uint32_t total_a, int step,
+ int rgb_stride) {
+ const uint32_t sum =
+ a_ptr[0] * GammaToLinear(src[0]) +
+ a_ptr[step] * GammaToLinear(src[step]) +
+ a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+ a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+ assert(total_a > 0 && total_a <= 4 * 0xff);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+ assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
+#endif
+ return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ uint8_t* const dst_y,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ for (i = 0, j = 0; i < width; ++i, j += step) {
+ dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
+ }
+}
+
+static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ const uint8_t* const a_ptr,
+ int rgb_stride,
+ uint8_t* const dst_u,
+ uint8_t* const dst_v,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ // we loop over 2x2 blocks and produce one U/V value for each.
+ for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * sizeof(uint32_t)) {
+ const uint32_t a = SUM4ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM4(r_ptr + j, 4);
+ g = SUM4(g_ptr + j, 4);
+ b = SUM4(b_ptr + j, 4);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ }
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+ if (width & 1) {
+ const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM2(r_ptr + j);
+ g = SUM2(g_ptr + j);
+ b = SUM2(b_ptr + j);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ }
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+}
+
+static WEBP_INLINE void ConvertRowsToUV(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step, int rgb_stride,
+ uint8_t* const dst_u,
+ uint8_t* const dst_v,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * step) {
+ const int r = SUM4(r_ptr + j, step);
+ const int g = SUM4(g_ptr + j, step);
+ const int b = SUM4(b_ptr + j, step);
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+ if (width & 1) {
+ const int r = SUM2(r_ptr + j);
+ const int g = SUM2(g_ptr + j);
+ const int b = SUM2(b_ptr + j);
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+}
+
+static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ const uint8_t* const a_ptr,
+ int step, // bytes per pixel
+ int rgb_stride, // bytes per scanline
+ float dithering,
+ int use_iterative_conversion,
+ WebPPicture* const picture) {
+ int y;
+ const int width = picture->width;
+ const int height = picture->height;
+ const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+
+ picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+ picture->use_argb = 0;
+
+ // disable the smart conversion when the source is too small (overkill).
+ if (width < kMinDimensionIterativeConversion ||
+ height < kMinDimensionIterativeConversion) {
+ use_iterative_conversion = 0;
+ }
+
+ if (!WebPPictureAllocYUVA(picture, width, height)) {
+ return 0;
+ }
+ if (has_alpha) {
+ WebPInitAlphaProcessing();
+ assert(step == 4);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+ assert(kAlphaFix + kGammaFix <= 31);
+#endif
+ }
+
+ if (use_iterative_conversion) {
+ InitGammaTablesF();
+ if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
+ return 0;
+ }
+ if (has_alpha) {
+ WebPExtractAlpha(a_ptr, rgb_stride, width, height,
+ picture->a, picture->a_stride);
+ }
+ } else {
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ uint8_t* dst_a = picture->a;
+
+ VP8Random base_rg;
+ VP8Random* rg = NULL;
+ if (dithering > 0.) {
+ VP8InitRandom(&base_rg, dithering);
+ rg = &base_rg;
+ }
+
+ InitGammaTables();
+
+ // Downsample Y/U/V planes, two rows at a time
+ for (y = 0; y < (height >> 1); ++y) {
+ int rows_have_alpha = has_alpha;
+ const int off1 = (2 * y + 0) * rgb_stride;
+ const int off2 = (2 * y + 1) * rgb_stride;
+ ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
+ dst_y, width, rg);
+ ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+ dst_y + picture->y_stride, width, rg);
+ dst_y += 2 * picture->y_stride;
+ if (has_alpha) {
+ rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
+ width, 2,
+ dst_a, picture->a_stride);
+ dst_a += 2 * picture->a_stride;
+ }
+ if (!rows_have_alpha) {
+ ConvertRowsToUV(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+ step, rgb_stride, dst_u, dst_v, width, rg);
+ } else {
+ ConvertRowsToUVWithAlpha(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+ a_ptr + off1, rgb_stride,
+ dst_u, dst_v, width, rg);
+ }
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
+ }
+ if (height & 1) { // extra last row
+ const int off = 2 * y * rgb_stride;
+ int row_has_alpha = has_alpha;
+ ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
+ dst_y, width, rg);
+ if (row_has_alpha) {
+ row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
+ }
+ if (!row_has_alpha) {
+ ConvertRowsToUV(r_ptr + off, g_ptr + off, b_ptr + off,
+ step, 0, dst_u, dst_v, width, rg);
+ } else {
+ ConvertRowsToUVWithAlpha(r_ptr + off, g_ptr + off, b_ptr + off,
+ a_ptr + off, 0,
+ dst_u, dst_v, width, rg);
+ }
+ }
+ }
+ return 1;
+}
+
+#undef SUM4
+#undef SUM2
+#undef SUM4ALPHA
+#undef SUM2ALPHA
+
+//------------------------------------------------------------------------------
+// call for ARGB->YUVA conversion
+
+static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering, int use_iterative_conversion) {
+ if (picture == NULL) return 0;
+ if (picture->argb == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ } else if ((colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ } else {
+ const uint8_t* const argb = (const uint8_t*)picture->argb;
+ const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
+ const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
+ const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
+ const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
+
+ picture->colorspace = WEBP_YUV420;
+ return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
+ dithering, use_iterative_conversion, picture);
+ }
+}
+
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering) {
+ return PictureARGBToYUVA(picture, colorspace, dithering, 0);
+}
+
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+ return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
+}
+
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+ return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
+}
+#endif
+
+//------------------------------------------------------------------------------
+// call for YUVA -> ARGB conversion
+
+int WebPPictureYUVAToARGB(WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+ // Allocate a new argb buffer (discarding the previous one).
+ if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+ picture->use_argb = 1;
+
+ // Convert
+ {
+ int y;
+ const int width = picture->width;
+ const int height = picture->height;
+ const int argb_stride = 4 * picture->argb_stride;
+ uint8_t* dst = (uint8_t*)picture->argb;
+ const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+ WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
+
+ // First row, with replicated top samples.
+ upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+ cur_y += picture->y_stride;
+ dst += argb_stride;
+ // Center rows.
+ for (y = 1; y + 1 < height; y += 2) {
+ const uint8_t* const top_u = cur_u;
+ const uint8_t* const top_v = cur_v;
+ cur_u += picture->uv_stride;
+ cur_v += picture->uv_stride;
+ upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
+ dst, dst + argb_stride, width);
+ cur_y += 2 * picture->y_stride;
+ dst += 2 * argb_stride;
+ }
+ // Last row (if needed), with replicated bottom samples.
+ if (height > 1 && !(height & 1)) {
+ upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+ }
+ // Insert alpha values if needed, replacing the default 0xff ones.
+ if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
+ for (y = 0; y < height; ++y) {
+ uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+ const uint8_t* const src = picture->a + y * picture->a_stride;
+ int x;
+ for (x = 0; x < width; ++x) {
+ argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// automatic import / conversion
+
+static int Import(WebPPicture* const picture,
+ const uint8_t* const rgb, int rgb_stride,
+ int step, int swap_rb, int import_alpha) {
+ int y;
+ const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
+ const uint8_t* const g_ptr = rgb + 1;
+ const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
+ const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
+ const int width = picture->width;
+ const int height = picture->height;
+
+ if (!picture->use_argb) {
+ return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+ 0.f /* no dithering */, 0, picture);
+ }
+ if (!WebPPictureAlloc(picture)) return 0;
+
+ assert(step >= (import_alpha ? 4 : 3));
+ for (y = 0; y < height; ++y) {
+ uint32_t* const dst = &picture->argb[y * picture->argb_stride];
+ int x;
+ for (x = 0; x < width; ++x) {
+ const int offset = step * x + y * rgb_stride;
+ dst[x] = MakeARGB32(import_alpha ? a_ptr[offset] : 0xff,
+ r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+ }
+ }
+ return 1;
+}
+
+// Public API
+
+int WebPPictureImportRGB(WebPPicture* picture,
+ const uint8_t* rgb, int rgb_stride) {
+ return (picture != NULL) ? Import(picture, rgb, rgb_stride, 3, 0, 0) : 0;
+}
+
+int WebPPictureImportBGR(WebPPicture* picture,
+ const uint8_t* rgb, int rgb_stride) {
+ return (picture != NULL) ? Import(picture, rgb, rgb_stride, 3, 1, 0) : 0;
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 0, 1) : 0;
+}
+
+int WebPPictureImportBGRA(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 1, 1) : 0;
+}
+
+int WebPPictureImportRGBX(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 0, 0) : 0;
+}
+
+int WebPPictureImportBGRX(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 1, 0) : 0;
+}
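+
+// Usage sketch (illustrative only; error handling elided, and 'w', 'h' and
+// 'rgba' stand for caller-supplied dimensions and pixel data): importing an
+// interleaved RGBA buffer before encoding could look like
+//
+//   WebPPicture pic;
+//   if (WebPPictureInit(&pic)) {
+//     pic.width = w;
+//     pic.height = h;
+//     pic.use_argb = 1;
+//     if (WebPPictureImportRGBA(&pic, rgba, 4 * w)) {
+//       // ... hand 'pic' over to WebPEncode() ...
+//     }
+//     WebPPictureFree(&pic);
+//   }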
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
new file mode 100644
index 0000000..2254b7e
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr.c
@@ -0,0 +1,150 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <math.h>
+
+#include "./vp8enci.h"
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2 // search radius. Shouldn't be too large.
+
+static float AccumulateLSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int x, y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+ const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+ for (x = 0; x < w; ++x) {
+ const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+ const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+ double best_sse = 255. * 255.;
+ const double value = (double)ref[y * ref_stride + x];
+ int i, j;
+ for (j = y_0; j < y_1; ++j) {
+ const uint8_t* s = src + j * src_stride;
+ for (i = x_0; i < x_1; ++i) {
+ const double sse = (double)(s[i] - value) * (s[i] - value);
+ if (sse < best_sse) best_sse = sse;
+ }
+ }
+ total_sse += best_sse;
+ }
+ }
+ return (float)total_sse;
+}
+#undef RADIUS
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+static float GetPSNR(const double v) {
+ return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
+ : kMinDistortion_dB);
+}
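+
+// The magic constant is 10 / ln(10) ~ 4.3429448, so GetPSNR(v) is the usual
+// 10 * log10(255 * 255 / v) dB figure for a mean squared error 'v'.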
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+ int type, float result[5]) {
+ DistoStats stats[5];
+ int has_alpha;
+ int uv_w, uv_h;
+
+ if (src == NULL || ref == NULL ||
+ src->width != ref->width || src->height != ref->height ||
+ src->y == NULL || ref->y == NULL ||
+ src->u == NULL || ref->u == NULL ||
+ src->v == NULL || ref->v == NULL ||
+ result == NULL) {
+ return 0;
+ }
+ // TODO(skal): provide distortion for ARGB too.
+ if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
+ return 0;
+ }
+
+ has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
+ if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
+ (has_alpha && (src->a == NULL || ref->a == NULL))) {
+ return 0;
+ }
+
+ memset(stats, 0, sizeof(stats));
+
+ uv_w = (src->width + 1) >> 1;
+ uv_h = (src->height + 1) >> 1;
+ if (type >= 2) {
+ float sse[4];
+ sse[0] = AccumulateLSIM(src->y, src->y_stride,
+ ref->y, ref->y_stride, src->width, src->height);
+ sse[1] = AccumulateLSIM(src->u, src->uv_stride,
+ ref->u, ref->uv_stride, uv_w, uv_h);
+ sse[2] = AccumulateLSIM(src->v, src->uv_stride,
+ ref->v, ref->uv_stride, uv_w, uv_h);
+ sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
+ ref->a, ref->a_stride,
+ src->width, src->height)
+ : 0.f;
+ result[0] = GetPSNR(sse[0] / (src->width * src->height));
+ result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
+ result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
+ result[3] = GetPSNR(sse[3] / (src->width * src->height));
+ {
+ double total_sse = sse[0] + sse[1] + sse[2];
+ int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
+ if (has_alpha) {
+ total_pixels += src->width * src->height;
+ total_sse += sse[3];
+ }
+ result[4] = GetPSNR(total_sse / total_pixels);
+ }
+ } else {
+ int c;
+ VP8SSIMAccumulatePlane(src->y, src->y_stride,
+ ref->y, ref->y_stride,
+ src->width, src->height, &stats[0]);
+ VP8SSIMAccumulatePlane(src->u, src->uv_stride,
+ ref->u, ref->uv_stride,
+ uv_w, uv_h, &stats[1]);
+ VP8SSIMAccumulatePlane(src->v, src->uv_stride,
+ ref->v, ref->uv_stride,
+ uv_w, uv_h, &stats[2]);
+ if (has_alpha) {
+ VP8SSIMAccumulatePlane(src->a, src->a_stride,
+ ref->a, ref->a_stride,
+ src->width, src->height, &stats[3]);
+ }
+ for (c = 0; c <= 4; ++c) {
+ if (type == 1) {
+ const double v = VP8SSIMGet(&stats[c]);
+ result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
+ : kMinDistortion_dB);
+ } else {
+ const double v = VP8SSIMGetSquaredError(&stats[c]);
+ result[c] = GetPSNR(v);
+ }
+ // Accumulate forward
+ if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
+ }
+ }
+ return 1;
+}
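+
+// As implemented above: 'type' selects the metric (1 = SSIM, >= 2 = the
+// asymmetric local-min LSIM measure, anything else = PSNR), and 'result'
+// receives five values, one each for the Y, U, V and alpha planes plus an
+// all-planes aggregate, all in dB and capped at kMinDistortion_dB for a
+// perfect match.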
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale.c
new file mode 100644
index 0000000..de52848
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale.c
@@ -0,0 +1,285 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: copy, crop, rescaling and view.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vp8enci.h"
+#include "../utils/rescaler.h"
+#include "../utils/utils.h"
+
+#define HALVE(x) (((x) + 1) >> 1)
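+
+// HALVE() rounds up, so odd-sized pictures keep their extra chroma sample:
+// e.g. HALVE(5) == 3.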
+
+// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
+// into 'dst'. Mark 'dst' as not owning any memory.
+static void PictureGrabSpecs(const WebPPicture* const src,
+ WebPPicture* const dst) {
+ assert(src != NULL && dst != NULL);
+ *dst = *src;
+ WebPPictureResetBuffers(dst);
+}
+
+//------------------------------------------------------------------------------
+// Picture copying
+
+static void CopyPlane(const uint8_t* src, int src_stride,
+ uint8_t* dst, int dst_stride, int width, int height) {
+ while (height-- > 0) {
+ memcpy(dst, src, width);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// Adjust top-left corner to chroma sample position.
+static void SnapTopLeftPosition(const WebPPicture* const pic,
+ int* const left, int* const top) {
+ if (!pic->use_argb) {
+ *left &= ~1;
+ *top &= ~1;
+ }
+}
+
+// Adjust top-left corner and verify that the sub-rectangle is valid.
+static int AdjustAndCheckRectangle(const WebPPicture* const pic,
+ int* const left, int* const top,
+ int width, int height) {
+ SnapTopLeftPosition(pic, left, top);
+ if ((*left) < 0 || (*top) < 0) return 0;
+ if (width <= 0 || height <= 0) return 0;
+ if ((*left) + width > pic->width) return 0;
+ if ((*top) + height > pic->height) return 0;
+ return 1;
+}
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+ if (src == NULL || dst == NULL) return 0;
+ if (src == dst) return 1;
+
+ PictureGrabSpecs(src, dst);
+ if (!WebPPictureAlloc(dst)) return 0;
+
+ if (!src->use_argb) {
+ CopyPlane(src->y, src->y_stride,
+ dst->y, dst->y_stride, dst->width, dst->height);
+ CopyPlane(src->u, src->uv_stride,
+ dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+ CopyPlane(src->v, src->uv_stride,
+ dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+ if (dst->a != NULL) {
+ CopyPlane(src->a, src->a_stride,
+ dst->a, dst->a_stride, dst->width, dst->height);
+ }
+ } else {
+ CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+ (uint8_t*)dst->argb, 4 * dst->argb_stride,
+ 4 * dst->width, dst->height);
+ }
+ return 1;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->use_argb) {
+ return (picture->memory_argb_ == NULL);
+ }
+ return (picture->memory_ == NULL);
+}
+
+int WebPPictureView(const WebPPicture* src,
+ int left, int top, int width, int height,
+ WebPPicture* dst) {
+ if (src == NULL || dst == NULL) return 0;
+
+ // verify rectangle position.
+ if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+
+ if (src != dst) { // beware of aliasing! We don't want to leak 'memory_'.
+ PictureGrabSpecs(src, dst);
+ }
+ dst->width = width;
+ dst->height = height;
+ if (!src->use_argb) {
+ dst->y = src->y + top * src->y_stride + left;
+ dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
+ dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+ dst->y_stride = src->y_stride;
+ dst->uv_stride = src->uv_stride;
+ if (src->a != NULL) {
+ dst->a = src->a + top * src->a_stride + left;
+ dst->a_stride = src->a_stride;
+ }
+ } else {
+ dst->argb = src->argb + top * src->argb_stride + left;
+ dst->argb_stride = src->argb_stride;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Picture cropping
+
+int WebPPictureCrop(WebPPicture* pic,
+ int left, int top, int width, int height) {
+ WebPPicture tmp;
+
+ if (pic == NULL) return 0;
+ if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+
+ PictureGrabSpecs(pic, &tmp);
+ tmp.width = width;
+ tmp.height = height;
+ if (!WebPPictureAlloc(&tmp)) return 0;
+
+ if (!pic->use_argb) {
+ const int y_offset = top * pic->y_stride + left;
+ const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
+ CopyPlane(pic->y + y_offset, pic->y_stride,
+ tmp.y, tmp.y_stride, width, height);
+ CopyPlane(pic->u + uv_offset, pic->uv_stride,
+ tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+ CopyPlane(pic->v + uv_offset, pic->uv_stride,
+ tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+
+ if (tmp.a != NULL) {
+ const int a_offset = top * pic->a_stride + left;
+ CopyPlane(pic->a + a_offset, pic->a_stride,
+ tmp.a, tmp.a_stride, width, height);
+ }
+ } else {
+ const uint8_t* const src =
+ (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
+ CopyPlane(src, pic->argb_stride * 4,
+ (uint8_t*)tmp.argb, tmp.argb_stride * 4,
+ width * 4, height);
+ }
+ WebPPictureFree(pic);
+ *pic = tmp;
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+static void RescalePlane(const uint8_t* src,
+ int src_width, int src_height, int src_stride,
+ uint8_t* dst,
+ int dst_width, int dst_height, int dst_stride,
+ int32_t* const work,
+ int num_channels) {
+ WebPRescaler rescaler;
+ int y = 0;
+ WebPRescalerInit(&rescaler, src_width, src_height,
+ dst, dst_width, dst_height, dst_stride,
+ num_channels,
+ src_width, dst_width,
+ src_height, dst_height,
+ work);
+ memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+ while (y < src_height) {
+ y += WebPRescalerImport(&rescaler, src_height - y,
+ src + y * src_stride, src_stride);
+ WebPRescalerExport(&rescaler);
+ }
+}
+
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+ assert(pic->argb != NULL);
+ WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
+ pic->width, pic->height, inverse);
+}
+
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+ if (pic->a != NULL) {
+ WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
+ pic->width, pic->height, inverse);
+ }
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+ WebPPicture tmp;
+ int prev_width, prev_height;
+ int32_t* work;
+
+ if (pic == NULL) return 0;
+ prev_width = pic->width;
+ prev_height = pic->height;
+ // if width is unspecified, scale the original width by the height ratio.
+ if (width == 0) {
+ width = (prev_width * height + prev_height / 2) / prev_height;
+ }
+ // if height is unspecified, scale the original height by the width ratio.
+ if (height == 0) {
+ height = (prev_height * width + prev_width / 2) / prev_width;
+ }
+ // Check if the overall dimensions still make sense.
+ if (width <= 0 || height <= 0) return 0;
+
+ PictureGrabSpecs(pic, &tmp);
+ tmp.width = width;
+ tmp.height = height;
+ if (!WebPPictureAlloc(&tmp)) return 0;
+
+ if (!pic->use_argb) {
+ work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+ if (work == NULL) {
+ WebPPictureFree(&tmp);
+ return 0;
+ }
+ // If present, we need to rescale alpha first (for AlphaMultiplyY).
+ if (pic->a != NULL) {
+ WebPInitAlphaProcessing();
+ RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+ tmp.a, width, height, tmp.a_stride, work, 1);
+ }
+
+ // We take transparency into account on the luma plane only. That's not
+ // exact blending, but it is still a good approximation.
+ AlphaMultiplyY(pic, 0);
+ RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+ tmp.y, width, height, tmp.y_stride, work, 1);
+ AlphaMultiplyY(&tmp, 1);
+
+ RescalePlane(pic->u,
+ HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+ tmp.u,
+ HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
+ RescalePlane(pic->v,
+ HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+ tmp.v,
+ HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
+ } else {
+ work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+ if (work == NULL) {
+ WebPPictureFree(&tmp);
+ return 0;
+ }
+ // In order to correctly interpolate colors, we need to apply the alpha
+ // weighting first (black-matting), scale the RGB values, and remove
+ // the premultiplication afterward (while preserving the alpha channel).
+ WebPInitAlphaProcessing();
+ AlphaMultiplyARGB(pic, 0);
+ RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
+ pic->argb_stride * 4,
+ (uint8_t*)tmp.argb, width, height,
+ tmp.argb_stride * 4,
+ work, 4);
+ AlphaMultiplyARGB(&tmp, 1);
+ }
+ WebPPictureFree(pic);
+ WebPSafeFree(work);
+ *pic = tmp;
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools.c
new file mode 100644
index 0000000..7c73646
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_tools.c
@@ -0,0 +1,206 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: alpha handling, etc.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./vp8enci.h"
+#include "../dsp/yuv.h"
+
+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+ return (0xff000000u | (r << 16) | (g << 8) | b);
+}
+
+//------------------------------------------------------------------------------
+// Helper: clean up fully transparent area to help compressibility.
+
+#define SIZE 8
+#define SIZE2 (SIZE / 2)
+static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
+ int y, x;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) {
+ if (ptr[x]) {
+ return 0;
+ }
+ }
+ ptr += stride;
+ }
+ return 1;
+}
+
+static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
+ int y, x;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) {
+ if (ptr[x] & 0xff000000u) {
+ return 0;
+ }
+ }
+ ptr += stride;
+ }
+ return 1;
+}
+
+static void flatten(uint8_t* ptr, int v, int stride, int size) {
+ int y;
+ for (y = 0; y < size; ++y) {
+ memset(ptr, v, size);
+ ptr += stride;
+ }
+}
+
+static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
+ int x, y;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) ptr[x] = v;
+ ptr += stride;
+ }
+}
+
+void WebPCleanupTransparentArea(WebPPicture* pic) {
+ int x, y, w, h;
+ if (pic == NULL) return;
+ w = pic->width / SIZE;
+ h = pic->height / SIZE;
+
+ // note: we ignore the left-overs on right/bottom
+ if (pic->use_argb) {
+ uint32_t argb_value = 0;
+ for (y = 0; y < h; ++y) {
+ int need_reset = 1;
+ for (x = 0; x < w; ++x) {
+ const int off = (y * pic->argb_stride + x) * SIZE;
+ if (is_transparent_argb_area(pic->argb + off, pic->argb_stride, SIZE)) {
+ if (need_reset) {
+ argb_value = pic->argb[off];
+ need_reset = 0;
+ }
+ flatten_argb(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+ } else {
+ need_reset = 1;
+ }
+ }
+ }
+ } else {
+ const uint8_t* const a_ptr = pic->a;
+ int values[3] = { 0 };
+ if (a_ptr == NULL) return; // nothing to do
+ for (y = 0; y < h; ++y) {
+ int need_reset = 1;
+ for (x = 0; x < w; ++x) {
+ const int off_a = (y * pic->a_stride + x) * SIZE;
+ const int off_y = (y * pic->y_stride + x) * SIZE;
+ const int off_uv = (y * pic->uv_stride + x) * SIZE2;
+ if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
+ if (need_reset) {
+ values[0] = pic->y[off_y];
+ values[1] = pic->u[off_uv];
+ values[2] = pic->v[off_uv];
+ need_reset = 0;
+ }
+ flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
+ flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
+ flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
+ } else {
+ need_reset = 1;
+ }
+ }
+ }
+ }
+}
+
+#undef SIZE
+#undef SIZE2
+
+//------------------------------------------------------------------------------
+// Blend color and remove transparency info
+
+#define BLEND(V0, V1, ALPHA) \
+ ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+ ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
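+
+// The '* 0x101 >> 16' idiom approximates division by 255: it computes
+// v * 257 / 65536, at most one below the exact quotient, so e.g.
+// BLEND(v0, v1, 255) yields v1 or v1 - 1. BLEND_10BIT applies the same idea
+// to an alpha value pre-summed over four pixels (range 0..1020), hence the
+// 1020 complement and the wider >> 18.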
+
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+ const int red = (background_rgb >> 16) & 0xff;
+ const int green = (background_rgb >> 8) & 0xff;
+ const int blue = (background_rgb >> 0) & 0xff;
+ int x, y;
+ if (pic == NULL) return;
+ if (!pic->use_argb) {
+ const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop
+ const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
+ // VP8RGBToU/V expects the u/v values summed over four pixels
+ const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+ const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+ const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+ if (!has_alpha || pic->a == NULL) return; // nothing to do
+ for (y = 0; y < pic->height; ++y) {
+ // Luma blending
+ uint8_t* const y_ptr = pic->y + y * pic->y_stride;
+ uint8_t* const a_ptr = pic->a + y * pic->a_stride;
+ for (x = 0; x < pic->width; ++x) {
+ const int alpha = a_ptr[x];
+ if (alpha < 0xff) {
+ y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
+ }
+ }
+ // Chroma blending every even line
+ if ((y & 1) == 0) {
+ uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
+ uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
+ uint8_t* const a_ptr2 =
+ (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+ for (x = 0; x < uv_width; ++x) {
+ // Average four alpha values into a single blending weight.
+ // TODO(skal): might lead to visible contouring. Can we do better?
+ const int alpha =
+ a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+ a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+ u[x] = BLEND_10BIT(U0, u[x], alpha);
+ v[x] = BLEND_10BIT(V0, v[x], alpha);
+ }
+ if (pic->width & 1) { // rightmost pixel
+ const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+ u[x] = BLEND_10BIT(U0, u[x], alpha);
+ v[x] = BLEND_10BIT(V0, v[x], alpha);
+ }
+ }
+ memset(a_ptr, 0xff, pic->width);
+ }
+ } else {
+ uint32_t* argb = pic->argb;
+ const uint32_t background = MakeARGB32(red, green, blue);
+ for (y = 0; y < pic->height; ++y) {
+ for (x = 0; x < pic->width; ++x) {
+ const int alpha = (argb[x] >> 24) & 0xff;
+ if (alpha != 0xff) {
+ if (alpha > 0) {
+ int r = (argb[x] >> 16) & 0xff;
+ int g = (argb[x] >> 8) & 0xff;
+ int b = (argb[x] >> 0) & 0xff;
+ r = BLEND(red, r, alpha);
+ g = BLEND(green, g, alpha);
+ b = BLEND(blue, b, alpha);
+ argb[x] = MakeARGB32(r, g, b);
+ } else {
+ argb[x] = background;
+ }
+ }
+ }
+ argb += pic->argb_stride;
+ }
+ }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant.c
index e1d202b..9130a41 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant.c
@@ -395,7 +395,7 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
// We also boost the dc-uv-quant a little, based on sns-strength, since
// U/V channels are quite more reactive to high quants (flat DC-blocks
- // tend to appear, and are displeasant).
+ // tend to appear, and are unpleasant).
dq_uv_dc = -4 * enc->config_->sns_strength / 100;
dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
@@ -454,13 +454,14 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
// |UUVV| 20
// +----+
-const int VP8Scan[16 + 4 + 4] = {
- // Luma
+const int VP8Scan[16] = { // Luma
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+};
+static const int VP8ScanUV[4 + 4] = {
0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
@@ -514,24 +515,27 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.
-// Trellis
-
+// Trellis node
typedef struct {
- int prev; // best previous
- int level; // level
- int sign; // sign of coeff_i
- score_t cost; // bit cost
- score_t error; // distortion = sum of (|coeff_i| - level_i * Q_i)^2
- int ctx; // context (only depends on 'level'. Could be spared.)
+ int8_t prev; // best previous node
+ int8_t sign; // sign of coeff_i
+ int16_t level; // level
} Node;
+// Score state
+typedef struct {
+ score_t score; // partial RD score
+ const uint16_t* costs; // shortcut to cost tables
+} ScoreState;
+
// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0 // how much lower level to try
#define MAX_DELTA 1 // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
-#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
+#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
+#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
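+
+// Only two rows of ScoreState are kept (current and previous, swapped at
+// each position): the rate/distortion update never looks more than one
+// coefficient back, while full Node records are kept for the final path
+// unwinding.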
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
// TODO: incorporate the "* 256" in the tables?
@@ -543,34 +547,36 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
return rate * lambda + 256 * distortion;
}
-static int TrellisQuantizeBlock(const VP8EncIterator* const it,
+static int TrellisQuantizeBlock(const VP8Encoder* const enc,
int16_t in[16], int16_t out[16],
int ctx0, int coeff_type,
const VP8Matrix* const mtx,
int lambda) {
- ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
- CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
+ const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
+ const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
const int first = (coeff_type == 0) ? 1 : 0;
- Node nodes[17][NUM_NODES];
+ Node nodes[16][NUM_NODES];
+ ScoreState score_states[2][NUM_NODES];
+ ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
+ ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
score_t best_score;
- int best_node;
- int last = first - 1;
- int n, m, p, nz;
+ int n, m, p, last;
{
score_t cost;
- score_t max_error;
const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
- const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];
+ const int last_proba = probas[VP8EncBands[first]][ctx0][0];
- // compute maximal distortion.
- max_error = 0;
- for (n = first; n < 16; ++n) {
- const int j = kZigzag[n];
+ // compute the position of the last interesting coefficient
+ last = first - 1;
+ for (n = 15; n >= first; --n) {
+ const int j = kZigzag[n];
const int err = in[j] * in[j];
- max_error += kWeightTrellis[j] * err;
- if (err > thresh) last = n;
+ if (err > thresh) {
+ last = n;
+ break;
+ }
}
// we don't need to go inspect up to n = 16 coeffs. We can just go up
// to last + 1 (inclusive) without losing much.
@@ -578,92 +584,95 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
// compute 'skip' score. This is the max score one can do.
cost = VP8BitCost(0, last_proba);
- best_score = RDScoreTrellis(lambda, cost, max_error);
+ best_score = RDScoreTrellis(lambda, cost, 0);
// initialize source node.
- n = first - 1;
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
- NODE(n, m).cost = 0;
- NODE(n, m).error = max_error;
- NODE(n, m).ctx = ctx0;
+ const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
+ ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
+ ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
}
}
// traverse trellis.
for (n = first; n <= last; ++n) {
- const int j = kZigzag[n];
- const int Q = mtx->q_[j];
- const int iQ = mtx->iq_[j];
- const int B = BIAS(0x00); // neutral bias
+ const int j = kZigzag[n];
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = BIAS(0x00); // neutral bias
// note: it's important to take sign of the _original_ coeff,
// so we don't have to consider level < 0 afterward.
const int sign = (in[j] < 0);
- const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
int level0 = QUANTDIV(coeff0, iQ, B);
if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+ { // Swap current and previous score states
+ ScoreState* const tmp = ss_cur;
+ ss_cur = ss_prev;
+ ss_prev = tmp;
+ }
+
// test all alternate level values around level0.
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
Node* const cur = &NODE(n, m);
- int delta_error, new_error;
- score_t cur_score = MAX_COST;
int level = level0 + m;
- int last_proba;
-
- cur->sign = sign;
- cur->level = level;
- cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
+ const int ctx = (level > 2) ? 2 : level;
+ const int band = VP8EncBands[n + 1];
+ score_t base_score, last_pos_score;
+ score_t best_cur_score = MAX_COST;
+ int best_prev = 0; // default, in case no better predecessor is found
+
+ ss_cur[m].score = MAX_COST;
+ ss_cur[m].costs = costs[band][ctx];
if (level > MAX_LEVEL || level < 0) { // node is dead?
- cur->cost = MAX_COST;
continue;
}
- last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];
- // Compute delta_error = how much coding this level will
- // subtract as distortion to max_error
- new_error = coeff0 - level * Q;
- delta_error =
- kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);
+ // Compute extra rate cost if last coeff's position is < 15
+ {
+ const score_t last_pos_cost =
+ (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+ last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
+ }
+
+ {
+ // Compute delta_error = how much coding this level will
+ // subtract from max_error as distortion.
+ // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
+ const int new_error = coeff0 - level * Q;
+ const int delta_error =
+ kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
+ base_score = RDScoreTrellis(lambda, 0, delta_error);
+ }
// Inspect all possible non-dead predecessors. Retain only the best one.
for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
- const Node* const prev = &NODE(n - 1, p);
- const int prev_ctx = prev->ctx;
- const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
- const score_t total_error = prev->error - delta_error;
- score_t cost, base_cost, score;
-
- if (prev->cost >= MAX_COST) { // dead node?
- continue;
- }
-
- // Base cost of both terminal/non-terminal
- base_cost = prev->cost + VP8LevelCost(tcost, level);
-
+ // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
+ // eliminated since their score can't be better than the current best.
+ const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
// Examine node assuming it's a non-terminal one.
- cost = base_cost;
- if (level && n < 15) {
- cost += VP8BitCost(1, last_proba);
+ const score_t score =
+ base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+ if (score < best_cur_score) {
+ best_cur_score = score;
+ best_prev = p;
}
- score = RDScoreTrellis(lambda, cost, total_error);
- if (score < cur_score) {
- cur_score = score;
- cur->cost = cost;
- cur->error = total_error;
- cur->prev = p;
- }
-
- // Now, record best terminal node (and thus best entry in the graph).
- if (level) {
- cost = base_cost;
- if (n < 15) cost += VP8BitCost(0, last_proba);
- score = RDScoreTrellis(lambda, cost, total_error);
- if (score < best_score) {
- best_score = score;
- best_path[0] = n; // best eob position
- best_path[1] = m; // best level
- best_path[2] = p; // best predecessor
- }
+ }
+ // Store best finding in current node.
+ cur->sign = sign;
+ cur->level = level;
+ cur->prev = best_prev;
+ ss_cur[m].score = best_cur_score;
+
+ // Now, record best terminal node (and thus best entry in the graph).
+ if (level != 0) {
+ const score_t score = best_cur_score + last_pos_score;
+ if (score < best_score) {
+ best_score = score;
+ best_path[0] = n; // best eob position
+ best_path[1] = m; // best node index
+ best_path[2] = best_prev; // best predecessor
}
}
}
@@ -676,23 +685,25 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
return 0; // skip!
}
- // Unwind the best path.
- // Note: best-prev on terminal node is not necessarily equal to the
- // best_prev for non-terminal. So we patch best_path[2] in.
- n = best_path[0];
- best_node = best_path[1];
- NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
- nz = 0;
-
- for (; n >= first; --n) {
- const Node* const node = &NODE(n, best_node);
- const int j = kZigzag[n];
- out[n] = node->sign ? -node->level : node->level;
- nz |= (node->level != 0);
- in[j] = out[n] * mtx->q_[j];
- best_node = node->prev;
+ {
+ // Unwind the best path.
+ // Note: best-prev on terminal node is not necessarily equal to the
+ // best_prev for non-terminal. So we patch best_path[2] in.
+ int nz = 0;
+ int best_node = best_path[1];
+ n = best_path[0];
+ NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
+
+ for (; n >= first; --n) {
+ const Node* const node = &NODE(n, best_node);
+ const int j = kZigzag[n];
+ out[n] = node->sign ? -node->level : node->level;
+ nz |= node->level;
+ in[j] = out[n] * mtx->q_[j];
+ best_node = node->prev;
+ }
+ return (nz != 0);
}
- return nz;
}
#undef NODE
@@ -706,10 +717,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
VP8ModeScore* const rd,
uint8_t* const yuv_out,
int mode) {
- VP8Encoder* const enc = it->enc_;
+ const VP8Encoder* const enc = it->enc_;
const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
const uint8_t* const src = it->yuv_in_ + Y_OFF;
- VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
int nz = 0;
int n;
int16_t tmp[16][16], dc_tmp[16];
@@ -727,20 +738,25 @@ static int ReconstructIntra16(VP8EncIterator* const it,
for (x = 0; x < 4; ++x, ++n) {
const int ctx = it->top_nz_[x] + it->left_nz_[y];
const int non_zero =
- TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
- &dqm->y1_, dqm->lambda_trellis_i16_);
+ TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
+ &dqm->y1_, dqm->lambda_trellis_i16_);
it->top_nz_[x] = it->left_nz_[y] = non_zero;
+ rd->y_ac_levels[n][0] = 0;
nz |= non_zero << n;
}
}
} else {
for (n = 0; n < 16; ++n) {
- nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
+ // Zero-out the first coeff, so that: a) nz is correct below, and
+ // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
+ tmp[n][0] = 0;
+ nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+ assert(rd->y_ac_levels[n][0] == 0);
}
}
// Transform back
- VP8ITransformWHT(dc_tmp, tmp[0]);
+ VP8TransformWHT(dc_tmp, tmp[0]);
for (n = 0; n < 16; n += 2) {
VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
}
@@ -763,10 +779,10 @@ static int ReconstructIntra4(VP8EncIterator* const it,
if (DO_TRELLIS_I4 && it->do_trellis_) {
const int x = it->i4_ & 3, y = it->i4_ >> 2;
const int ctx = it->top_nz_[x] + it->left_nz_[y];
- nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
+ nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
dqm->lambda_trellis_i4_);
} else {
- nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
+ nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
}
VP8ITransform(ref, tmp, yuv_out, 0);
return nz;
@@ -783,7 +799,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
int16_t tmp[8][16];
for (n = 0; n < 8; ++n) {
- VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
+ VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
}
if (DO_TRELLIS_UV && it->do_trellis_) {
int ch, x, y;
@@ -792,8 +808,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
for (x = 0; x < 2; ++x, ++n) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
const int non_zero =
- TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
- &dqm->uv_, dqm->lambda_trellis_uv_);
+ TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
+ &dqm->uv_, dqm->lambda_trellis_uv_);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
nz |= non_zero << n;
}
@@ -801,12 +817,12 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
}
} else {
for (n = 0; n < 8; ++n) {
- nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
+ nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
}
}
for (n = 0; n < 8; n += 2) {
- VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
+ VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
}
return (nz << 16);
}
@@ -851,8 +867,7 @@ static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
const int kNumBlocks = 16;
- VP8Encoder* const enc = it->enc_;
- VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
const int lambda = dqm->lambda_i16_;
const int tlambda = dqm->tlambda_;
const uint8_t* const src = it->yuv_in_ + Y_OFF;
@@ -999,8 +1014,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
const int kNumBlocks = 8;
- const VP8Encoder* const enc = it->enc_;
- const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
const int lambda = dqm->lambda_uv_;
const uint8_t* const src = it->yuv_in_ + U_OFF;
uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF; // scratch buffer
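
The ReconstructIntra16()/ReconstructUV() hunks above stop addressing chroma blocks as VP8Scan[16 + n] and use a dedicated VP8ScanUV table instead, matching the shrunken VP8Scan[16] declaration in the vp8enci.h hunk further down. A sketch of what such tables hold, assuming libwebp's packed work-buffer layout (BPS stride, 8 U + 8 V chroma samples side by side); the names and exact offsets here are illustrative:

    #define BPS 32  // assumed stride of the yuv_in_/yuv_out_ work buffers

    // Luma: 16 4x4 sub-blocks of the 16x16 macroblock, raster order.
    static const int kScanY[16] = {
      0 +  0 * BPS,  4 +  0 * BPS,  8 +  0 * BPS, 12 +  0 * BPS,
      0 +  4 * BPS,  4 +  4 * BPS,  8 +  4 * BPS, 12 +  4 * BPS,
      0 +  8 * BPS,  4 +  8 * BPS,  8 +  8 * BPS, 12 +  8 * BPS,
      0 + 12 * BPS,  4 + 12 * BPS,  8 + 12 * BPS, 12 + 12 * BPS
    };

    // Chroma: 4 U sub-blocks then 4 V sub-blocks; U and V sit 8 columns
    // apart in the same rows, hence the separate table.
    static const int kScanUV[4 + 4] = {
      0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,   // U
      8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
    };
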
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax.c
index 08cfe79..d1ff0a5 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax.c
@@ -263,53 +263,16 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,
//------------------------------------------------------------------------------
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-#define KTRAILER_SIZE 8
-
-static int WriteExtensions(VP8Encoder* const enc) {
- uint8_t buffer[KTRAILER_SIZE];
- VP8BitWriter* const bw = &enc->bw_;
- WebPPicture* const pic = enc->pic_;
-
- // Layer (bytes 0..3)
- PutLE24(buffer + 0, enc->layer_data_size_);
- buffer[3] = enc->pic_->colorspace & WEBP_CSP_UV_MASK;
- if (enc->layer_data_size_ > 0) {
- assert(enc->use_layer_);
- // append layer data to last partition
- if (!VP8BitWriterAppend(&enc->parts_[enc->num_parts_ - 1],
- enc->layer_data_, enc->layer_data_size_)) {
- return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
- }
- }
-
- buffer[KTRAILER_SIZE - 1] = 0x01; // marker
- if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) {
- return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
- }
- return 1;
-}
-
-#endif /* WEBP_EXPERIMENTAL_FEATURES */
-
-//------------------------------------------------------------------------------
-
-static size_t GeneratePartition0(VP8Encoder* const enc) {
+static int GeneratePartition0(VP8Encoder* const enc) {
VP8BitWriter* const bw = &enc->bw_;
const int mb_size = enc->mb_w_ * enc->mb_h_;
uint64_t pos1, pos2, pos3;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- const int need_extensions = enc->use_layer_;
-#endif
pos1 = VP8BitWriterPos(bw);
- VP8BitWriterInit(bw, mb_size * 7 / 8); // ~7 bits per macroblock
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- VP8PutBitUniform(bw, need_extensions); // extensions
-#else
+ if (!VP8BitWriterInit(bw, mb_size * 7 / 8)) { // ~7 bits per macroblock
+ return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
VP8PutBitUniform(bw, 0); // colorspace
-#endif
VP8PutBitUniform(bw, 0); // clamp type
PutSegmentHeader(bw, enc);
@@ -324,21 +287,17 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
VP8CodeIntraModes(enc);
VP8BitWriterFinish(bw);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (need_extensions && !WriteExtensions(enc)) {
- return 0;
- }
-#endif
-
pos3 = VP8BitWriterPos(bw);
if (enc->pic_->stats) {
enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
- enc->pic_->stats->layer_data_size = (int)enc->layer_data_size_;
}
- return !bw->error_;
+ if (bw->error_) {
+ return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ return 1;
}
void VP8EncFreeBitWriters(VP8Encoder* const enc) {
@@ -360,7 +319,8 @@ int VP8EncWrite(VP8Encoder* const enc) {
int p;
// Partition #0 with header and partition sizes
- ok = !!GeneratePartition0(enc);
+ ok = GeneratePartition0(enc);
+ if (!ok) return 0;
// Compute VP8 size
vp8_size = VP8_FRAME_HEADER_SIZE +
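
A pattern worth noting in the GeneratePartition0() rewrite above: every failure path returns through WebPEncodingSetError(). This works as a one-liner because the helper records the error code on the picture and always returns 0 (failure). A sketch of that convention, assuming the usual libwebp definition:

    #include "webp/encode.h"   // WebPPicture, WebPEncodingError

    int WebPEncodingSetError(const WebPPicture* const pic,
                             WebPEncodingError error) {
      ((WebPPicture*)pic)->error_code = error;  // cast away const to record it
      return 0;                                 // always signals failure
    }

    // so a failing path can report and bail out in one statement:
    //   if (!VP8BitWriterInit(bw, size)) {
    //     return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
    //   }
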
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token.c
index e696642..8af13a0 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token.c
@@ -22,27 +22,32 @@
#include "./cost.h"
#include "./vp8enci.h"
+#include "../utils/utils.h"
#if !defined(DISABLE_TOKEN_BUFFER)
// we use pages to reduce the number of memcpy()
-#define MAX_NUM_TOKEN 8192 // max number of token per page
+#define MIN_PAGE_SIZE 8192           // minimum number of tokens per page
#define FIXED_PROBA_BIT (1u << 14)
+typedef uint16_t token_t; // bit#15: bit
+ // bit #14: constant proba or idx
+ // bits 0..13: slot or constant proba
struct VP8Tokens {
- uint16_t tokens_[MAX_NUM_TOKEN]; // bit#15: bit
- // bit #14: constant proba or idx
- // bits 0..13: slot or constant proba
- VP8Tokens* next_;
+ VP8Tokens* next_; // pointer to next page
};
+// Token data is located in memory just after the next_ field.
+// This macro is used to return its address and hide the trick.
+#define TOKEN_DATA(p) ((token_t*)&(p)[1])
//------------------------------------------------------------------------------
-void VP8TBufferInit(VP8TBuffer* const b) {
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
b->tokens_ = NULL;
b->pages_ = NULL;
b->last_page_ = &b->pages_;
b->left_ = 0;
+ b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
b->error_ = 0;
}
@@ -51,24 +56,29 @@ void VP8TBufferClear(VP8TBuffer* const b) {
const VP8Tokens* p = b->pages_;
while (p != NULL) {
const VP8Tokens* const next = p->next_;
- free((void*)p);
+ WebPSafeFree((void*)p);
p = next;
}
- VP8TBufferInit(b);
+ VP8TBufferInit(b, b->page_size_);
}
}
static int TBufferNewPage(VP8TBuffer* const b) {
- VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+ VP8Tokens* page = NULL;
+ const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
+ if (!b->error_) {
+ page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
+ }
if (page == NULL) {
b->error_ = 1;
return 0;
}
+ page->next_ = NULL;
+
*b->last_page_ = page;
b->last_page_ = &page->next_;
- b->left_ = MAX_NUM_TOKEN;
- b->tokens_ = page->tokens_;
- page->next_ = NULL;
+ b->left_ = b->page_size_;
+ b->tokens_ = TOKEN_DATA(page);
return 1;
}
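
TBufferNewPage() above allocates a page's header and its token array in one malloc; TOKEN_DATA() simply points just past the struct. A self-contained sketch of the idiom (names here are illustrative, not libwebp's):

    #include <stdlib.h>
    #include <stdint.h>

    typedef uint16_t token_t;

    typedef struct Page Page;
    struct Page {
      Page* next_;              // header only; token data follows in memory
    };
    // First address past the header, where the token array begins.
    #define TOKEN_DATA(p) ((token_t*)&(p)[1])

    static Page* NewPage(size_t n_tokens) {
      // One allocation covers the header plus n_tokens trailing tokens.
      Page* const page =
          (Page*)malloc(sizeof(*page) + n_tokens * sizeof(token_t));
      if (page != NULL) page->next_ = NULL;
      return page;
    }
    // usage: token_t* const slots = TOKEN_DATA(page);  // n_tokens entries

Compared with the old fixed tokens_[MAX_NUM_TOKEN] array inside the struct, this keeps one allocation per page while letting the page size vary at runtime.
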
@@ -195,8 +205,9 @@ void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
while (p != NULL) {
const int N = (p->next_ == NULL) ? b->left_ : 0;
int n = MAX_NUM_TOKEN;
+ const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
- const uint16_t token = p->tokens_[n];
+ const token_t token = tokens[n];
if (!(token & FIXED_PROBA_BIT)) {
Record((token >> 15) & 1, stats + (token & 0x3fffu));
}
@@ -214,13 +225,14 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
const uint8_t* const probas, int final_pass) {
const VP8Tokens* p = b->pages_;
(void)final_pass;
- if (b->error_) return 0;
+ assert(!b->error_);
while (p != NULL) {
const VP8Tokens* const next = p->next_;
const int N = (next == NULL) ? b->left_ : 0;
- int n = MAX_NUM_TOKEN;
+ int n = b->page_size_;
+ const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
- const uint16_t token = p->tokens_[n];
+ const token_t token = tokens[n];
const int bit = (token >> 15) & 1;
if (token & FIXED_PROBA_BIT) {
VP8PutBit(bw, bit, token & 0xffu); // constant proba
@@ -228,7 +240,7 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
VP8PutBit(bw, bit, probas[token & 0x3fffu]);
}
}
- if (final_pass) free((void*)p);
+ if (final_pass) WebPSafeFree((void*)p);
p = next;
}
if (final_pass) b->pages_ = NULL;
@@ -239,13 +251,14 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
size_t size = 0;
const VP8Tokens* p = b->pages_;
- if (b->error_) return 0;
+ assert(!b->error_);
while (p != NULL) {
const VP8Tokens* const next = p->next_;
const int N = (next == NULL) ? b->left_ : 0;
- int n = MAX_NUM_TOKEN;
+ int n = b->page_size_;
+ const token_t* const tokens = TOKEN_DATA(p);
while (n-- > N) {
- const uint16_t token = p->tokens_[n];
+ const token_t token = tokens[n];
const int bit = token & (1 << 15);
if (token & FIXED_PROBA_BIT) {
size += VP8BitCost(bit, token & 0xffu);
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8enci.h
index 71adf6c..74c8f70 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8enci.h
@@ -30,7 +30,7 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 0
#define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 0
+#define ENC_REV_VERSION 3
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
@@ -130,8 +130,8 @@ typedef enum { // Rate-distortion optimization levels
#define ALIGN_CST 15
#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
-extern const int VP8Scan[16 + 4 + 4]; // in quant.c
-extern const int VP8UVModeOffsets[4]; // in analyze.c
+extern const int VP8Scan[16]; // in quant.c
+extern const int VP8UVModeOffsets[4]; // in analyze.c
extern const int VP8I16ModeOffsets[4];
extern const int VP8I4ModeOffsets[NUM_BMODES];
@@ -160,14 +160,16 @@ extern const int VP8I4ModeOffsets[NUM_BMODES];
#define I4TMP (6 * 16 * BPS + 8 * BPS + 8)
typedef int64_t score_t; // type used for scores, rate, distortion
+// Note that MAX_COST is deliberately smaller than the largest value
+// representable by score_t, so that cost sums may exceed it without
+// overflowing the type.
#define MAX_COST ((score_t)0x7fffffffffffffLL)
#define QFIX 17
#define BIAS(b) ((b) << (QFIX - 8))
// Fun fact: this is the _only_ line where we're actually being lossy and
// discarding bits.
-static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
- return (n * iQ + B) >> QFIX;
+static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
+ return (int)((n * iQ + B) >> QFIX);
}
// size of histogram used by CollectHistogram.
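
QUANTDIV() above is fixed-point division by the quantizer step: with QFIX = 17 and iQ a precomputed reciprocal of Q, (n * iQ + B) >> QFIX approximates (n + rounding bias) / Q, and the widened uint32_t arguments keep the product n * iQ well-defined when it exceeds INT_MAX. A runnable sketch with worked numbers (how iq_[] and bias_[] are actually derived in the encoder is an assumption here):

    #include <stdint.h>
    #include <stdio.h>

    #define QFIX 17

    static int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
      return (int)((n * iQ + B) >> QFIX);
    }

    int main(void) {
      const uint32_t Q  = 20;
      const uint32_t iQ = (1u << QFIX) / Q;   // 6553, ~1/Q in Q17 fixed point
      const uint32_t B  = 1u << (QFIX - 1);   // bias 0.5 -> round to nearest
      printf("%d\n", QUANTDIV(137, iQ, B));   // 137/20 = 6.85 -> prints 7
      return 0;
    }
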
@@ -204,9 +206,9 @@ typedef struct {
typedef struct {
uint8_t segments_[3]; // probabilities for segment tree
uint8_t skip_proba_; // final probability of being skipped.
- ProbaArray coeffs_[NUM_TYPES][NUM_BANDS]; // 924 bytes
+ ProbaArray coeffs_[NUM_TYPES][NUM_BANDS]; // 1056 bytes
StatsArray stats_[NUM_TYPES][NUM_BANDS]; // 4224 bytes
- CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 11.4k
+ CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 13056 bytes
int dirty_; // if true, need to call VP8CalculateLevelCosts()
int use_skip_proba_; // Note: we always use skip_proba for now.
int nb_skip_; // number of skipped blocks
@@ -236,8 +238,8 @@ typedef struct {
typedef struct VP8Matrix {
uint16_t q_[16]; // quantizer steps
uint16_t iq_[16]; // reciprocals, fixed point.
- uint16_t bias_[16]; // rounding bias
- uint16_t zthresh_[16]; // value under which a coefficient is zeroed
+ uint32_t bias_[16]; // rounding bias
+ uint32_t zthresh_[16]; // value below which a coefficient is zeroed
uint16_t sharpen_[16]; // frequency boosters for slight sharpening
} VP8Matrix;
@@ -361,12 +363,14 @@ typedef struct {
VP8Tokens* pages_; // first page
VP8Tokens** last_page_; // last page
uint16_t* tokens_; // set to (*last_page_)->tokens_
- int left_; // how many free tokens left before the page is full.
+ int left_; // how many free tokens left before the page is full
+ int page_size_; // number of tokens per page
#endif
int error_; // true in case of malloc error
} VP8TBuffer;
-void VP8TBufferInit(VP8TBuffer* const b); // initialize an empty buffer
+// initialize an empty buffer
+void VP8TBufferInit(VP8TBuffer* const b, int page_size);
void VP8TBufferClear(VP8TBuffer* const b); // de-allocate pages memory
#if !defined(DISABLE_TOKEN_BUFFER)
@@ -422,12 +426,6 @@ struct VP8Encoder {
uint32_t alpha_data_size_;
WebPWorker alpha_worker_;
- // enhancement layer
- int use_layer_;
- VP8BitWriter layer_bw_;
- uint8_t* layer_data_;
- size_t layer_data_size_;
-
// quantization info (one set of DC/AC dequant factor per segment)
VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
int base_quant_; // nominal quantizer value. Only used
@@ -459,10 +457,10 @@ struct VP8Encoder {
VP8MBInfo* mb_info_; // contextual macroblock infos (mb_w_ + 1)
uint8_t* preds_; // predictions modes: (4*mb_w+1) * (4*mb_h+1)
uint32_t* nz_; // non-zero bit context: mb_w+1
- uint8_t *y_top_; // top luma samples.
- uint8_t *uv_top_; // top u/v samples.
+ uint8_t* y_top_; // top luma samples.
+ uint8_t* uv_top_; // top u/v samples.
// U and V are packed into 16 bytes (8 U + 8 V)
- LFStats *lf_stats_; // autofilter stats (if NULL, autofilter is off)
+ LFStats* lf_stats_; // autofilter stats (if NULL, autofilter is off)
};
//------------------------------------------------------------------------------
@@ -533,12 +531,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc); // start alpha coding process
int VP8EncFinishAlpha(VP8Encoder* const enc); // finalize compressed data
int VP8EncDeleteAlpha(VP8Encoder* const enc); // delete compressed data
- // in layer.c
-void VP8EncInitLayer(VP8Encoder* const enc); // init everything
-void VP8EncCodeLayerBlock(VP8EncIterator* it); // code one more macroblock
-int VP8EncFinishLayer(VP8Encoder* const enc); // finalize coding
-void VP8EncDeleteLayer(VP8Encoder* enc); // reclaim memory
-
// in filter.c
// SSIM utils
@@ -561,8 +553,28 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it);
// step of 'delta', given a sharpness parameter 'sharpness'.
int VP8FilterStrengthFromDelta(int sharpness, int delta);
+ // misc utils for picture_*.c:
+
+// Remove reference to the ARGB/YUVA buffer (doesn't free anything).
+void WebPPictureResetBuffers(WebPPicture* const picture);
+
+// Allocates ARGB buffer of given dimension (previous one is always free'd).
+// Preserves the YUV(A) buffer. Returns false in case of error (invalid param,
+// out-of-memory).
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
+
+// Allocates YUVA buffer of given dimension (previous one is always free'd).
+// Uses picture->csp to determine whether an alpha buffer is needed.
+// Preserves the ARGB buffer.
+// Returns false in case of error (invalid param, out-of-memory).
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
+
//------------------------------------------------------------------------------
+#if WEBP_ENCODER_ABI_VERSION <= 0x0203
+void WebPMemoryWriterClear(WebPMemoryWriter* writer);
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l.c
index 1572631..c2bb13d 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l.c
@@ -106,14 +106,9 @@ static int AnalyzeEntropy(const uint32_t* argb,
const uint32_t* last_line = NULL;
uint32_t last_pix = argb[0]; // so we're sure that pix_diff == 0
- VP8LHistogram* nonpredicted = NULL;
- VP8LHistogram* predicted =
- (VP8LHistogram*)malloc(2 * sizeof(*predicted));
- if (predicted == NULL) return 0;
- nonpredicted = predicted + 1;
-
- VP8LHistogramInit(predicted, 0);
- VP8LHistogramInit(nonpredicted, 0);
+ VP8LHistogramSet* const histo_set = VP8LAllocateHistogramSet(2, 0);
+ if (histo_set == NULL) return 0;
+
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
const uint32_t pix = argb[x];
@@ -126,21 +121,28 @@ static int AnalyzeEntropy(const uint32_t* argb,
{
const PixOrCopy pix_token = PixOrCopyCreateLiteral(pix);
const PixOrCopy pix_diff_token = PixOrCopyCreateLiteral(pix_diff);
- VP8LHistogramAddSinglePixOrCopy(nonpredicted, &pix_token);
- VP8LHistogramAddSinglePixOrCopy(predicted, &pix_diff_token);
+ VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[0], &pix_token);
+ VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[1],
+ &pix_diff_token);
}
}
last_line = argb;
argb += argb_stride;
}
- *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(nonpredicted);
- *predicted_bits = VP8LHistogramEstimateBitsBulk(predicted);
- free(predicted);
+ *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[0]);
+ *predicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[1]);
+ VP8LFreeHistogramSet(histo_set);
return 1;
}
-static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
+static int AnalyzeAndInit(VP8LEncoder* const enc, WebPImageHint image_hint) {
const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const int pix_cnt = width * height;
+ // we round the block size up, so we're guaranteed to have
+  // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+ int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
assert(pic != NULL && pic->argb != NULL);
enc->use_palette_ =
@@ -158,7 +160,7 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
enc->use_cross_color_ = 1;
} else {
double non_pred_entropy, pred_entropy;
- if (!AnalyzeEntropy(pic->argb, pic->width, pic->height, pic->argb_stride,
+ if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
&non_pred_entropy, &pred_entropy)) {
return 0;
}
@@ -168,27 +170,38 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
}
}
}
+ if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
+
+  // palette-friendly input typically uses fewer literals
+ // -> reduce block size a bit
+ if (enc->use_palette_) refs_block_size /= 2;
+ VP8LBackwardRefsInit(&enc->refs_[0], refs_block_size);
+ VP8LBackwardRefsInit(&enc->refs_[1], refs_block_size);
return 1;
}
+// Returns false in case of memory error.
static int GetHuffBitLengthsAndCodes(
const VP8LHistogramSet* const histogram_image,
HuffmanTreeCode* const huffman_codes) {
int i, k;
- int ok = 1;
+ int ok = 0;
uint64_t total_length_size = 0;
uint8_t* mem_buf = NULL;
const int histogram_image_size = histogram_image->size;
+ int max_num_symbols = 0;
+ uint8_t* buf_rle = NULL;
+ HuffmanTree* huff_tree = NULL;
// Iterate over all histograms and get the aggregate number of codes used.
for (i = 0; i < histogram_image_size; ++i) {
const VP8LHistogram* const histo = histogram_image->histograms[i];
HuffmanTreeCode* const codes = &huffman_codes[5 * i];
for (k = 0; k < 5; ++k) {
- const int num_symbols = (k == 0) ? VP8LHistogramNumCodes(histo)
- : (k == 4) ? NUM_DISTANCE_CODES
- : 256;
+ const int num_symbols =
+ (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) :
+ (k == 4) ? NUM_DISTANCE_CODES : 256;
codes[k].num_symbols = num_symbols;
total_length_size += num_symbols;
}
@@ -200,10 +213,8 @@ static int GetHuffBitLengthsAndCodes(
uint8_t* lengths;
mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
sizeof(*lengths) + sizeof(*codes));
- if (mem_buf == NULL) {
- ok = 0;
- goto End;
- }
+ if (mem_buf == NULL) goto End;
+
codes = (uint16_t*)mem_buf;
lengths = (uint8_t*)&codes[total_length_size];
for (i = 0; i < 5 * histogram_image_size; ++i) {
@@ -212,24 +223,33 @@ static int GetHuffBitLengthsAndCodes(
huffman_codes[i].code_lengths = lengths;
codes += bit_length;
lengths += bit_length;
+ if (max_num_symbols < bit_length) {
+ max_num_symbols = bit_length;
+ }
}
}
+ buf_rle = (uint8_t*)WebPSafeMalloc(1ULL, max_num_symbols);
+ huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * max_num_symbols,
+ sizeof(*huff_tree));
+ if (buf_rle == NULL || huff_tree == NULL) goto End;
+
// Create Huffman trees.
- for (i = 0; ok && (i < histogram_image_size); ++i) {
+ for (i = 0; i < histogram_image_size; ++i) {
HuffmanTreeCode* const codes = &huffman_codes[5 * i];
VP8LHistogram* const histo = histogram_image->histograms[i];
- ok = ok && VP8LCreateHuffmanTree(histo->literal_, 15, codes + 0);
- ok = ok && VP8LCreateHuffmanTree(histo->red_, 15, codes + 1);
- ok = ok && VP8LCreateHuffmanTree(histo->blue_, 15, codes + 2);
- ok = ok && VP8LCreateHuffmanTree(histo->alpha_, 15, codes + 3);
- ok = ok && VP8LCreateHuffmanTree(histo->distance_, 15, codes + 4);
+ VP8LCreateHuffmanTree(histo->literal_, 15, buf_rle, huff_tree, codes + 0);
+ VP8LCreateHuffmanTree(histo->red_, 15, buf_rle, huff_tree, codes + 1);
+ VP8LCreateHuffmanTree(histo->blue_, 15, buf_rle, huff_tree, codes + 2);
+ VP8LCreateHuffmanTree(histo->alpha_, 15, buf_rle, huff_tree, codes + 3);
+ VP8LCreateHuffmanTree(histo->distance_, 15, buf_rle, huff_tree, codes + 4);
}
-
+ ok = 1;
End:
+ WebPSafeFree(huff_tree);
+ WebPSafeFree(buf_rle);
if (!ok) {
- free(mem_buf);
- // If one VP8LCreateHuffmanTree() above fails, we need to clean up behind.
+ WebPSafeFree(mem_buf);
memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
}
return ok;
@@ -296,18 +316,16 @@ static void StoreHuffmanTreeToBitMask(
}
}
-static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
- const HuffmanTreeCode* const tree) {
- int ok = 0;
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeToken* const tokens,
+ const HuffmanTreeCode* const tree) {
uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 };
uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 };
const int max_tokens = tree->num_symbols;
int num_tokens;
HuffmanTreeCode huffman_code;
- HuffmanTreeToken* const tokens =
- (HuffmanTreeToken*)WebPSafeMalloc((uint64_t)max_tokens, sizeof(*tokens));
- if (tokens == NULL) return 0;
-
huffman_code.num_symbols = CODE_LENGTH_CODES;
huffman_code.code_lengths = code_length_bitdepth;
huffman_code.codes = code_length_bitdepth_symbols;
@@ -315,15 +333,14 @@ static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
VP8LWriteBits(bw, 1, 0);
num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
{
- int histogram[CODE_LENGTH_CODES] = { 0 };
+ uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
+ uint8_t buf_rle[CODE_LENGTH_CODES] = { 0 };
int i;
for (i = 0; i < num_tokens; ++i) {
++histogram[tokens[i].code];
}
- if (!VP8LCreateHuffmanTree(histogram, 7, &huffman_code)) {
- goto End;
- }
+ VP8LCreateHuffmanTree(histogram, 7, buf_rle, huff_tree, &huffman_code);
}
StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth);
@@ -360,14 +377,13 @@ static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
}
StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
}
- ok = 1;
- End:
- free(tokens);
- return ok;
}
-static int StoreHuffmanCode(VP8LBitWriter* const bw,
- const HuffmanTreeCode* const huffman_code) {
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreHuffmanCode(VP8LBitWriter* const bw,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeToken* const tokens,
+ const HuffmanTreeCode* const huffman_code) {
int i;
int count = 0;
int symbols[2] = { 0, 0 };
@@ -385,7 +401,6 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
if (count == 0) { // emit minimal tree for empty cases
// bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
VP8LWriteBits(bw, 4, 0x01);
- return 1;
} else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
VP8LWriteBits(bw, 1, 1); // Small tree marker to encode 1 or 2 symbols.
VP8LWriteBits(bw, 1, count - 1);
@@ -399,9 +414,8 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
if (count == 2) {
VP8LWriteBits(bw, 8, symbols[1]);
}
- return 1;
} else {
- return StoreFullHuffmanCode(bw, huffman_code);
+ StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
}
}
@@ -413,18 +427,18 @@ static void WriteHuffmanCode(VP8LBitWriter* const bw,
VP8LWriteBits(bw, depth, symbol);
}
-static void StoreImageToBitMask(
+static WebPEncodingError StoreImageToBitMask(
VP8LBitWriter* const bw, int width, int histo_bits,
- const VP8LBackwardRefs* const refs,
+ VP8LBackwardRefs* const refs,
const uint16_t* histogram_symbols,
const HuffmanTreeCode* const huffman_codes) {
// x and y trace the position in the image.
int x = 0;
int y = 0;
const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
- int i;
- for (i = 0; i < refs->size; ++i) {
- const PixOrCopy* const v = &refs->refs[i];
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
const int histogram_ix = histogram_symbols[histo_bits ?
(y >> histo_bits) * histo_xsize +
(x >> histo_bits) : 0];
@@ -458,88 +472,128 @@ static void StoreImageToBitMask(
x -= width;
++y;
}
+ VP8LRefsCursorNext(&c);
}
+ return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
}
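
StoreImageToBitMask() now walks the backward references through a cursor rather than indexing a flat refs->refs[i] array: the refs introduced by this version live in chained blocks, and the cursor hides the hop from one block to the next. The traversal pattern, as a usage fragment of the API appearing above (types and functions as declared in this patch):

    VP8LRefsCursor c = VP8LRefsCursorInit(refs);
    while (VP8LRefsCursorOk(&c)) {
      const PixOrCopy* const v = c.cur_pos;  // current literal/copy token
      /* ... emit or count v ... */
      VP8LRefsCursorNext(&c);                // steps across block borders
    }
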
// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
-static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
- const uint32_t* const argb,
- int width, int height, int quality) {
+static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
+ const uint32_t* const argb,
+ VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs refs_array[2],
+ int width, int height,
+ int quality) {
int i;
- int ok = 0;
- VP8LBackwardRefs refs;
+ int max_tokens = 0;
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LBackwardRefs* refs;
+ HuffmanTreeToken* tokens = NULL;
HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
const uint16_t histogram_symbols[1] = { 0 }; // only one tree, one symbol
VP8LHistogramSet* const histogram_image = VP8LAllocateHistogramSet(1, 0);
- if (histogram_image == NULL) return 0;
+ HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+ 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+ if (histogram_image == NULL || huff_tree == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
// Calculate backward references from ARGB image.
- if (!VP8LGetBackwardReferences(width, height, argb, quality, 0, 1, &refs)) {
+ refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, 1,
+ hash_chain, refs_array);
+ if (refs == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
// Build histogram image and symbols from backward references.
- VP8LHistogramStoreRefs(&refs, histogram_image->histograms[0]);
+ VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
// Create Huffman bit lengths and codes for each histogram image.
assert(histogram_image->size == 1);
if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
// No color cache, no Huffman image.
VP8LWriteBits(bw, 1, 0);
- // Store Huffman codes.
+ // Find maximum number of symbols for the huffman tree-set.
for (i = 0; i < 5; ++i) {
HuffmanTreeCode* const codes = &huffman_codes[i];
- if (!StoreHuffmanCode(bw, codes)) {
- goto Error;
+ if (max_tokens < codes->num_symbols) {
+ max_tokens = codes->num_symbols;
}
+ }
+
+ tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+ if (tokens == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Store Huffman codes.
+ for (i = 0; i < 5; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ StoreHuffmanCode(bw, huff_tree, tokens, codes);
ClearHuffmanTreeIfOnlyOneSymbol(codes);
}
// Store actual literals.
- StoreImageToBitMask(bw, width, 0, &refs, histogram_symbols, huffman_codes);
- ok = 1;
+ err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
+ huffman_codes);
Error:
- free(histogram_image);
- VP8LClearBackwardRefs(&refs);
- free(huffman_codes[0].codes);
- return ok;
+ WebPSafeFree(tokens);
+ WebPSafeFree(huff_tree);
+ VP8LFreeHistogramSet(histogram_image);
+ WebPSafeFree(huffman_codes[0].codes);
+ return err;
}
-static int EncodeImageInternal(VP8LBitWriter* const bw,
- const uint32_t* const argb,
- int width, int height, int quality,
- int cache_bits, int histogram_bits) {
- int ok = 0;
+static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
+ const uint32_t* const argb,
+ VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs refs_array[2],
+ int width, int height, int quality,
+ int cache_bits,
+ int histogram_bits) {
+ WebPEncodingError err = VP8_ENC_OK;
const int use_2d_locality = 1;
const int use_color_cache = (cache_bits > 0);
const uint32_t histogram_image_xysize =
VP8LSubSampleSize(width, histogram_bits) *
VP8LSubSampleSize(height, histogram_bits);
VP8LHistogramSet* histogram_image =
- VP8LAllocateHistogramSet(histogram_image_xysize, 0);
+ VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits);
int histogram_image_size = 0;
size_t bit_array_size = 0;
+ HuffmanTree* huff_tree = NULL;
+ HuffmanTreeToken* tokens = NULL;
HuffmanTreeCode* huffman_codes = NULL;
VP8LBackwardRefs refs;
+ VP8LBackwardRefs* best_refs;
uint16_t* const histogram_symbols =
- (uint16_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+ (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
sizeof(*histogram_symbols));
assert(histogram_bits >= MIN_HUFFMAN_BITS);
assert(histogram_bits <= MAX_HUFFMAN_BITS);
+ VP8LBackwardRefsInit(&refs, refs_array[0].block_size_);
if (histogram_image == NULL || histogram_symbols == NULL) {
- free(histogram_image);
- free(histogram_symbols);
+ VP8LFreeHistogramSet(histogram_image);
+ WebPSafeFree(histogram_symbols);
return 0;
}
+  // 'best_refs' points to the best backward refs found, i.e. to one of
+  // refs_array[0] or refs_array[1].
// Calculate backward references from ARGB image.
- if (!VP8LGetBackwardReferences(width, height, argb, quality, cache_bits,
- use_2d_locality, &refs)) {
+ best_refs = VP8LGetBackwardReferences(width, height, argb, quality,
+ cache_bits, use_2d_locality,
+ hash_chain, refs_array);
+ if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) {
goto Error;
}
// Build histogram image and symbols from backward references.
@@ -559,7 +613,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
goto Error;
}
// Free combined histograms.
- free(histogram_image);
+ VP8LFreeHistogramSet(histogram_image);
histogram_image = NULL;
// Color Cache parameters.
@@ -574,7 +628,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
VP8LWriteBits(bw, 1, write_histogram_image);
if (write_histogram_image) {
uint32_t* const histogram_argb =
- (uint32_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+ (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
sizeof(*histogram_argb));
int max_index = 0;
uint32_t i;
@@ -589,40 +643,54 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
histogram_image_size = max_index;
VP8LWriteBits(bw, 3, histogram_bits - 2);
- ok = EncodeImageNoHuffman(bw, histogram_argb,
- VP8LSubSampleSize(width, histogram_bits),
- VP8LSubSampleSize(height, histogram_bits),
- quality);
- free(histogram_argb);
- if (!ok) goto Error;
+ err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
+ VP8LSubSampleSize(width, histogram_bits),
+ VP8LSubSampleSize(height, histogram_bits),
+ quality);
+ WebPSafeFree(histogram_argb);
+ if (err != VP8_ENC_OK) goto Error;
}
}
// Store Huffman codes.
{
int i;
+ int max_tokens = 0;
+ huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES,
+ sizeof(*huff_tree));
+ if (huff_tree == NULL) goto Error;
+ // Find maximum number of symbols for the huffman tree-set.
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ if (max_tokens < codes->num_symbols) {
+ max_tokens = codes->num_symbols;
+ }
+ }
+ tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens,
+ sizeof(*tokens));
+ if (tokens == NULL) goto Error;
for (i = 0; i < 5 * histogram_image_size; ++i) {
HuffmanTreeCode* const codes = &huffman_codes[i];
- if (!StoreHuffmanCode(bw, codes)) goto Error;
+ StoreHuffmanCode(bw, huff_tree, tokens, codes);
ClearHuffmanTreeIfOnlyOneSymbol(codes);
}
}
// Store actual literals.
- StoreImageToBitMask(bw, width, histogram_bits, &refs,
- histogram_symbols, huffman_codes);
- ok = 1;
+ err = StoreImageToBitMask(bw, width, histogram_bits, &refs,
+ histogram_symbols, huffman_codes);
Error:
- free(histogram_image);
-
- VP8LClearBackwardRefs(&refs);
+ WebPSafeFree(tokens);
+ WebPSafeFree(huff_tree);
+ VP8LFreeHistogramSet(histogram_image);
+ VP8LBackwardRefsClear(&refs);
if (huffman_codes != NULL) {
- free(huffman_codes->codes);
- free(huffman_codes);
+ WebPSafeFree(huffman_codes->codes);
+ WebPSafeFree(huffman_codes);
}
- free(histogram_symbols);
- return ok;
+ WebPSafeFree(histogram_symbols);
+ return err;
}
// -----------------------------------------------------------------------------
@@ -630,17 +698,16 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
// Check if it would be a good idea to subtract green from red and blue. We
// only impact entropy in red/blue components, don't bother to look at others.
-static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
- int width, int height,
- VP8LBitWriter* const bw) {
+static WebPEncodingError EvalAndApplySubtractGreen(VP8LEncoder* const enc,
+ int width, int height,
+ VP8LBitWriter* const bw) {
if (!enc->use_palette_) {
int i;
const uint32_t* const argb = enc->argb_;
double bit_cost_before, bit_cost_after;
- VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
- if (histo == NULL) return 0;
-
- VP8LHistogramInit(histo, 1);
+ // Allocate histogram with cache_bits = 1.
+ VP8LHistogram* const histo = VP8LAllocateHistogram(1);
+ if (histo == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
for (i = 0; i < width * height; ++i) {
const uint32_t c = argb[i];
++histo->red_[(c >> 16) & 0xff];
@@ -656,7 +723,7 @@ static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
++histo->blue_[((c >> 0) - green) & 0xff];
}
bit_cost_after = VP8LHistogramEstimateBits(histo);
- free(histo);
+ VP8LFreeHistogram(histo);
// Check if subtracting green yields low entropy.
enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
@@ -666,12 +733,12 @@ static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
}
}
- return 1;
+ return VP8_ENC_OK;
}
-static int ApplyPredictFilter(const VP8LEncoder* const enc,
- int width, int height, int quality,
- VP8LBitWriter* const bw) {
+static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
+ int width, int height, int quality,
+ VP8LBitWriter* const bw) {
const int pred_bits = enc->transform_bits_;
const int transform_width = VP8LSubSampleSize(width, pred_bits);
const int transform_height = VP8LSubSampleSize(height, pred_bits);
@@ -682,32 +749,32 @@ static int ApplyPredictFilter(const VP8LEncoder* const enc,
VP8LWriteBits(bw, 2, PREDICTOR_TRANSFORM);
assert(pred_bits >= 2);
VP8LWriteBits(bw, 3, pred_bits - 2);
- if (!EncodeImageNoHuffman(bw, enc->transform_data_,
- transform_width, transform_height, quality)) {
- return 0;
- }
- return 1;
+ return EncodeImageNoHuffman(bw, enc->transform_data_,
+ (VP8LHashChain*)&enc->hash_chain_,
+ (VP8LBackwardRefs*)enc->refs_, // cast const away
+ transform_width, transform_height,
+ quality);
}
-static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
- int width, int height, int quality,
- VP8LBitWriter* const bw) {
+static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
+ int width, int height,
+ int quality,
+ VP8LBitWriter* const bw) {
const int ccolor_transform_bits = enc->transform_bits_;
const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
- const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
- VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
+ VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
enc->argb_, enc->transform_data_);
VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM);
assert(ccolor_transform_bits >= 2);
VP8LWriteBits(bw, 3, ccolor_transform_bits - 2);
- if (!EncodeImageNoHuffman(bw, enc->transform_data_,
- transform_width, transform_height, quality)) {
- return 0;
- }
- return 1;
+ return EncodeImageNoHuffman(bw, enc->transform_data_,
+ (VP8LHashChain*)&enc->hash_chain_,
+ (VP8LBackwardRefs*)enc->refs_, // cast const away
+ transform_width, transform_height,
+ quality);
}
// -----------------------------------------------------------------------------
@@ -785,11 +852,11 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
const int tile_size = 1 << enc->transform_bits_;
const uint64_t image_size = width * height;
const uint64_t argb_scratch_size = tile_size * width + width;
- const uint64_t transform_data_size =
- (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
- (uint64_t)VP8LSubSampleSize(height, enc->transform_bits_);
+ const int transform_data_size =
+ VP8LSubSampleSize(width, enc->transform_bits_) *
+ VP8LSubSampleSize(height, enc->transform_bits_);
const uint64_t total_size =
- image_size + argb_scratch_size + transform_data_size;
+ image_size + argb_scratch_size + (uint64_t)transform_data_size;
uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
if (mem == NULL) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -888,7 +955,7 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
if (err != VP8_ENC_OK) goto Error;
dst = enc->argb_;
- row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
+ row = (uint8_t*)WebPSafeMalloc(width, sizeof(*row));
if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
@@ -902,42 +969,48 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
for (i = palette_size - 1; i >= 1; --i) {
palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
}
- if (!EncodeImageNoHuffman(bw, palette, palette_size, 1, quality)) {
- err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
- goto Error;
- }
+ err = EncodeImageNoHuffman(bw, palette, &enc->hash_chain_, enc->refs_,
+ palette_size, 1, quality);
Error:
- free(row);
+ WebPSafeFree(row);
return err;
}
// -----------------------------------------------------------------------------
static int GetHistoBits(int method, int use_palette, int width, int height) {
- const uint64_t hist_size = sizeof(VP8LHistogram);
+ const int hist_size = VP8LGetHistogramSize(MAX_COLOR_CACHE_BITS);
// Make tile size a function of encoding method (Range: 0 to 6).
int histo_bits = (use_palette ? 9 : 7) - method;
while (1) {
- const uint64_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
- VP8LSubSampleSize(height, histo_bits) *
- hist_size;
- if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+ const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+ VP8LSubSampleSize(height, histo_bits);
+ if ((uint64_t)huff_image_size * hist_size <= MAX_HUFF_IMAGE_SIZE) break;
++histo_bits;
}
return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
(histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
}
+static int GetTransformBits(int method, int histo_bits) {
+ const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+ return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+}
+
+static int GetCacheBits(float quality) {
+ return (quality <= 25.f) ? 0 : 7;
+}
+
static void FinishEncParams(VP8LEncoder* const enc) {
const WebPConfig* const config = enc->config_;
const WebPPicture* const pic = enc->pic_;
const int method = config->method;
const float quality = config->quality;
const int use_palette = enc->use_palette_;
- enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
- enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
+ enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+ enc->cache_bits_ = GetCacheBits(quality);
}
// -----------------------------------------------------------------------------
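
FinishEncParams() now derives transform_bits_ from histo_bits_ rather than from method alone. Tracing the helpers above with illustrative inputs:

    // GetTransformBits(): method < 4 -> 6, method > 4 -> 4, method == 4 -> 5,
    // then capped by histo_bits:
    //   method = 5, histo_bits = 3  ->  transform_bits = min(3, 4) = 3
    //   method = 2, histo_bits = 7  ->  transform_bits = min(7, 6) = 6
    //
    // GetCacheBits(): color cache off at low quality, else 7 bits:
    //   quality = 20.f  ->  cache_bits = 0
    //   quality = 80.f  ->  cache_bits = 7
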
@@ -945,7 +1018,7 @@ static void FinishEncParams(VP8LEncoder* const enc) {
static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
const WebPPicture* const picture) {
- VP8LEncoder* const enc = (VP8LEncoder*)calloc(1, sizeof(*enc));
+ VP8LEncoder* const enc = (VP8LEncoder*)WebPSafeCalloc(1ULL, sizeof(*enc));
if (enc == NULL) {
WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
return NULL;
@@ -959,8 +1032,13 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
}
static void VP8LEncoderDelete(VP8LEncoder* enc) {
- free(enc->argb_);
- free(enc);
+ if (enc != NULL) {
+ VP8LHashChainClear(&enc->hash_chain_);
+ VP8LBackwardRefsClear(&enc->refs_[0]);
+ VP8LBackwardRefsClear(&enc->refs_[1]);
+ WebPSafeFree(enc->argb_);
+ WebPSafeFree(enc);
+ }
}
// -----------------------------------------------------------------------------
@@ -984,7 +1062,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
// ---------------------------------------------------------------------------
// Analyze image (entropy, num_palettes etc)
- if (!VP8LEncAnalyze(enc, config->image_hint)) {
+ if (!AnalyzeAndInit(enc, config->image_hint)) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
@@ -1003,6 +1081,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
int y;
err = AllocateTransformBuffer(enc, width, height);
if (err != VP8_ENC_OK) goto Error;
+ assert(enc->argb_ != NULL);
for (y = 0; y < height; ++y) {
memcpy(enc->argb_ + y * width,
picture->argb + y * picture->argb_stride,
@@ -1014,23 +1093,17 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
// ---------------------------------------------------------------------------
// Apply transforms and write transform data.
- if (!EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw)) {
- err = VP8_ENC_ERROR_OUT_OF_MEMORY;
- goto Error;
- }
+ err = EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw);
+ if (err != VP8_ENC_OK) goto Error;
if (enc->use_predict_) {
- if (!ApplyPredictFilter(enc, enc->current_width_, height, quality, bw)) {
- err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
- goto Error;
- }
+ err = ApplyPredictFilter(enc, enc->current_width_, height, quality, bw);
+ if (err != VP8_ENC_OK) goto Error;
}
if (enc->use_cross_color_) {
- if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw)) {
- err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
- goto Error;
- }
+ err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw);
+ if (err != VP8_ENC_OK) goto Error;
}
VP8LWriteBits(bw, 1, !TRANSFORM_PRESENT); // No more transforms.
@@ -1040,8 +1113,9 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
if (enc->cache_bits_ > 0) {
if (!VP8LCalculateEstimateForCacheSize(enc->argb_, enc->current_width_,
- height, &enc->cache_bits_)) {
- err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
+ height, quality, &enc->hash_chain_,
+ &enc->refs_[0], &enc->cache_bits_)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
}
@@ -1049,11 +1123,10 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
// ---------------------------------------------------------------------------
// Encode and write the transformed image.
- if (!EncodeImageInternal(bw, enc->argb_, enc->current_width_, height,
- quality, enc->cache_bits_, enc->histo_bits_)) {
- err = VP8_ENC_ERROR_OUT_OF_MEMORY;
- goto Error;
- }
+ err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+ enc->current_width_, height, quality,
+ enc->cache_bits_, enc->histo_bits_);
+ if (err != VP8_ENC_OK) goto Error;
if (picture->stats != NULL) {
WebPAuxStats* const stats = picture->stats;
@@ -1080,6 +1153,7 @@ int VP8LEncodeImage(const WebPConfig* const config,
int has_alpha;
size_t coded_size;
int percent = 0;
+ int initial_size;
WebPEncodingError err = VP8_ENC_OK;
VP8LBitWriter bw;
@@ -1093,7 +1167,11 @@ int VP8LEncodeImage(const WebPConfig* const config,
width = picture->width;
height = picture->height;
- if (!VP8LBitWriterInit(&bw, (width * height) >> 1)) {
+  // Initialize BitWriter with size corresponding to 16 bpp for photo images and
+ // 8 bpp for graphical images.
+ initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
+ width * height : width * height * 2;
+ if (!VP8LBitWriterInit(&bw, initial_size)) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
@@ -1165,4 +1243,3 @@ int VP8LEncodeImage(const WebPConfig* const config,
}
//------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/vp8li.h b/src/3rdparty/libwebp/src/enc/vp8li.h
index 96d6fae..6b6db12 100644
--- a/src/3rdparty/libwebp/src/enc/vp8li.h
+++ b/src/3rdparty/libwebp/src/enc/vp8li.h
@@ -14,6 +14,7 @@
#ifndef WEBP_ENC_VP8LI_H_
#define WEBP_ENC_VP8LI_H_
+#include "./backward_references.h"
#include "./histogram.h"
#include "../utils/bit_writer.h"
#include "../webp/encode.h"
@@ -45,6 +46,12 @@ typedef struct {
int use_palette_;
int palette_size_;
uint32_t palette_[MAX_PALETTE_SIZE];
+
+ // Some 'scratch' (potentially large) objects.
+ struct VP8LBackwardRefs refs_[2]; // Backward Refs array corresponding to
+ // LZ77 & RLE coding.
+ VP8LHashChain hash_chain_; // HashChain data for constructing
+ // backward references.
} VP8LEncoder;
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webpenc.c
index 207cce6..ca85e0b 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webpenc.c
@@ -18,6 +18,7 @@
#include "./vp8enci.h"
#include "./vp8li.h"
+#include "./cost.h"
#include "../utils/utils.h"
// #define PRINT_MEMORY_INFO
@@ -33,31 +34,6 @@ int WebPGetEncoderVersion(void) {
}
//------------------------------------------------------------------------------
-// WebPPicture
-//------------------------------------------------------------------------------
-
-static int DummyWriter(const uint8_t* data, size_t data_size,
- const WebPPicture* const picture) {
- // The following are to prevent 'unused variable' error message.
- (void)data;
- (void)data_size;
- (void)picture;
- return 1;
-}
-
-int WebPPictureInitInternal(WebPPicture* picture, int version) {
- if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
- return 0; // caller/system version mismatch!
- }
- if (picture != NULL) {
- memset(picture, 0, sizeof(*picture));
- picture->writer = DummyWriter;
- WebPEncodingSetError(picture, VP8_ENC_OK);
- }
- return 1;
-}
-
-//------------------------------------------------------------------------------
// VP8Encoder
//------------------------------------------------------------------------------
@@ -143,23 +119,21 @@ static void MapConfigToTools(VP8Encoder* const enc) {
// Memory scaling with dimensions:
// memory (bytes) ~= 2.25 * w + 0.0625 * w * h
//
-// Typical memory footprint (768x510 picture)
-// Memory used:
-// encoder: 33919
-// block cache: 2880
-// info: 3072
-// preds: 24897
-// top samples: 1623
-// non-zero: 196
-// lf-stats: 2048
-// total: 68635
+// Typical memory footprint (614x440 picture)
+// encoder: 22111
+// info: 4368
+// preds: 17741
+// top samples: 1263
+// non-zero: 175
+// lf-stats: 0
+// total: 45658
// Transient object sizes:
-// VP8EncIterator: 352
-// VP8ModeScore: 912
-// VP8SegmentInfo: 532
-// VP8Proba: 31032
+// VP8EncIterator: 3360
+// VP8ModeScore: 872
+// VP8SegmentInfo: 732
+// VP8Proba: 18352
// LFStats: 2048
-// Picture size (yuv): 589824
+// Picture size (yuv): 419328
static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
WebPPicture* const picture) {
@@ -251,13 +225,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
ResetSegmentHeader(enc);
ResetFilterHeader(enc);
ResetBoundaryPredictions(enc);
-
+ VP8GetResidualCostInit();
+ VP8SetResidualCoeffsInit();
VP8EncInitAlpha(enc);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- VP8EncInitLayer(enc);
-#endif
- VP8TBufferInit(&enc->tokens_);
+  // lower quality means smaller output -> we modulate the page size a little
+  // based on quality. This is just a crude first-order prediction.
+ {
+ const float scale = 1.f + config->quality * 5.f / 100.f; // in [1,6]
+ VP8TBufferInit(&enc->tokens_, (int)(mb_w * mb_h * 4 * scale));
+ }
return enc;
}
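
The token-page sizing above is a simple quality ramp: scale runs linearly from 1 at quality 0 to 6 at quality 100. Plugging in numbers for an illustrative 640x480 picture (mb_w = 40, mb_h = 30):

    // quality =   0 -> scale = 1.00 -> page_size = 40*30*4*1.00 =  4800
    //                  (VP8TBufferInit() then raises it to MIN_PAGE_SIZE = 8192)
    // quality =  75 -> scale = 4.75 -> page_size = 22800
    // quality = 100 -> scale = 6.00 -> page_size = 28800
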
@@ -265,11 +242,8 @@ static int DeleteVP8Encoder(VP8Encoder* enc) {
int ok = 1;
if (enc != NULL) {
ok = VP8EncDeleteAlpha(enc);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- VP8EncDeleteLayer(enc);
-#endif
VP8TBufferClear(&enc->tokens_);
- free(enc);
+ WebPSafeFree(enc);
}
return ok;
}
@@ -352,18 +326,26 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
if (!config->lossless) {
VP8Encoder* enc = NULL;
- if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+ if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
// Make sure we have YUVA samples.
- float dithering = 0.f;
- if (config->preprocessing & 2) {
- const float x = config->quality / 100.f;
- const float x2 = x * x;
- // slowly decreasing from max dithering at low quality (q->0)
- // to 0.5 dithering amplitude at high quality (q->100)
- dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
- }
- if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
- return 0;
+ if (config->preprocessing & 4) {
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+ if (!WebPPictureSmartARGBToYUVA(pic)) {
+ return 0;
+ }
+#endif
+ } else {
+ float dithering = 0.f;
+ if (config->preprocessing & 2) {
+ const float x = config->quality / 100.f;
+ const float x2 = x * x;
+ // slowly decreasing from max dithering at low quality (q->0)
+ // to 0.5 dithering amplitude at high quality (q->100)
+ dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+ }
+ if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+ return 0;
+ }
}
}
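
For reference, the dithering ramp retained above evaluates as follows at a few qualities (pure arithmetic on the formula shown):

    // q =   0 -> x = 0.00, x2*x2 = 0      -> dithering = 1.0   (full amplitude)
    // q =  50 -> x = 0.50, x2*x2 = 0.0625 -> dithering ~ 0.97
    // q = 100 -> x = 1.00, x2*x2 = 1      -> dithering = 0.5   (half amplitude)
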
@@ -380,9 +362,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
ok = ok && VP8EncTokenLoop(enc);
}
ok = ok && VP8EncFinishAlpha(enc);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- ok = ok && VP8EncFinishLayer(enc);
-#endif
ok = ok && VP8EncWrite(enc);
StoreStats(enc);
@@ -401,4 +380,3 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
return ok;
}
-
diff --git a/src/3rdparty/libwebp/src/mux/muxedit.c b/src/3rdparty/libwebp/src/mux/muxedit.c
index 25770b3..24ca471 100644
--- a/src/3rdparty/libwebp/src/mux/muxedit.c
+++ b/src/3rdparty/libwebp/src/mux/muxedit.c
@@ -20,17 +20,18 @@
// Life of a mux object.
static void MuxInit(WebPMux* const mux) {
- if (mux == NULL) return;
+ assert(mux != NULL);
memset(mux, 0, sizeof(*mux));
+ mux->canvas_width_ = 0; // just to be explicit
+ mux->canvas_height_ = 0;
}
WebPMux* WebPNewInternal(int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_MUX_ABI_VERSION)) {
return NULL;
} else {
- WebPMux* const mux = (WebPMux*)malloc(sizeof(WebPMux));
- // If mux is NULL MuxInit is a noop.
- MuxInit(mux);
+ WebPMux* const mux = (WebPMux*)WebPSafeMalloc(1ULL, sizeof(WebPMux));
+ if (mux != NULL) MuxInit(mux);
return mux;
}
}
@@ -43,7 +44,7 @@ static void DeleteAllImages(WebPMuxImage** const wpi_list) {
}
static void MuxRelease(WebPMux* const mux) {
- if (mux == NULL) return;
+ assert(mux != NULL);
DeleteAllImages(&mux->images_);
ChunkListDelete(&mux->vp8x_);
ChunkListDelete(&mux->iccp_);
@@ -54,9 +55,10 @@ static void MuxRelease(WebPMux* const mux) {
}
void WebPMuxDelete(WebPMux* mux) {
- // If mux is NULL MuxRelease is a noop.
- MuxRelease(mux);
- free(mux);
+ if (mux != NULL) {
+ MuxRelease(mux);
+ WebPSafeFree(mux);
+ }
}
//------------------------------------------------------------------------------
@@ -102,7 +104,7 @@ static WebPMuxError CreateFrameFragmentData(
assert(info->dispose_method == (info->dispose_method & 1));
// Note: assertion on upper bounds is done in PutLE24().
- frame_frgm_bytes = (uint8_t*)malloc(frame_frgm_size);
+ frame_frgm_bytes = (uint8_t*)WebPSafeMalloc(1ULL, frame_frgm_size);
if (frame_frgm_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
PutLE24(frame_frgm_bytes + 0, info->x_offset / 2);
@@ -360,6 +362,34 @@ WebPMuxError WebPMuxSetAnimationParams(WebPMux* mux,
return MuxSet(mux, kChunks[IDX_ANIM].tag, 1, &anim, 1);
}
+#if WEBP_MUX_ABI_VERSION > 0x0101
+WebPMuxError WebPMuxSetCanvasSize(WebPMux* mux,
+ int width, int height) {
+ WebPMuxError err;
+ if (mux == NULL) {
+ return WEBP_MUX_INVALID_ARGUMENT;
+ }
+ if (width < 0 || height < 0 ||
+ width > MAX_CANVAS_SIZE || height > MAX_CANVAS_SIZE) {
+ return WEBP_MUX_INVALID_ARGUMENT;
+ }
+ if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
+ return WEBP_MUX_INVALID_ARGUMENT;
+ }
+ if ((width * height) == 0 && (width | height) != 0) {
+ // one of width / height is zero, but not both -> invalid!
+ return WEBP_MUX_INVALID_ARGUMENT;
+ }
+ // If we already assembled a VP8X chunk, invalidate it.
+ err = MuxDeleteAllNamedData(mux, kChunks[IDX_VP8X].tag);
+ if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;
+
+ mux->canvas_width_ = width;
+ mux->canvas_height_ = height;
+ return WEBP_MUX_OK;
+}
+#endif
+
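
One subtlety in the validation above: the area test promotes one operand to uint64_t so that width * height cannot overflow a 32-bit int before the comparison. A condensed, self-contained sketch of the same checks (constant values are assumptions based on format_constants.h):

    #include <stdint.h>

    #define MAX_CANVAS_SIZE (1 << 24)     // assumed 24-bit VP8X field limit
    #define MAX_IMAGE_AREA  (1ULL << 32)  // assumed limit on width * height

    static int IsValidCanvas(int width, int height) {
      if (width < 0 || height < 0 ||
          width > MAX_CANVAS_SIZE || height > MAX_CANVAS_SIZE) return 0;
      // uint64_t promotion: the product is exact, no signed overflow.
      if (width * (uint64_t)height >= MAX_IMAGE_AREA) return 0;
      // exactly one of width/height is zero -> invalid
      if ((width == 0 || height == 0) && (width | height) != 0) return 0;
      return 1;
    }
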
//------------------------------------------------------------------------------
// Delete API(s).
@@ -413,9 +443,10 @@ static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
return WEBP_MUX_OK;
}
-static WebPMuxError GetImageCanvasWidthHeight(
- const WebPMux* const mux, uint32_t flags,
- int* const width, int* const height) {
+// Returns the tightest dimensions for the canvas, considering the image list.
+static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
+ uint32_t flags,
+ int* const width, int* const height) {
WebPMuxImage* wpi = NULL;
assert(mux != NULL);
assert(width != NULL && height != NULL);
@@ -513,12 +544,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
flags |= ALPHA_FLAG; // Some images have an alpha channel.
}
- if (flags == 0) {
- // For Simple Image, VP8X chunk should not be added.
- return WEBP_MUX_OK;
- }
-
- err = GetImageCanvasWidthHeight(mux, flags, &width, &height);
+ err = GetAdjustedCanvasSize(mux, flags, &width, &height);
if (err != WEBP_MUX_OK) return err;
if (width <= 0 || height <= 0) {
@@ -528,6 +554,19 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
return WEBP_MUX_INVALID_ARGUMENT;
}
+ if (mux->canvas_width_ != 0 || mux->canvas_height_ != 0) {
+ if (width > mux->canvas_width_ || height > mux->canvas_height_) {
+ return WEBP_MUX_INVALID_ARGUMENT;
+ }
+ width = mux->canvas_width_;
+ height = mux->canvas_height_;
+ }
+
+ if (flags == 0) {
+ // For Simple Image, VP8X chunk should not be added.
+ return WEBP_MUX_OK;
+ }
+
if (MuxHasAlpha(images)) {
// This means some frames explicitly/implicitly contain alpha.
// Note: This 'flags' update must NOT be done for a lossless image
@@ -548,9 +587,9 @@ static WebPMuxError MuxCleanup(WebPMux* const mux) {
int num_fragments;
int num_anim_chunks;
- // If we have an image with single fragment or frame, convert it to a
- // non-animated non-fragmented image (to avoid writing FRGM/ANMF chunk
- // unnecessarily).
+ // If we have an image with a single fragment or frame, and its rectangle
+ // covers the whole canvas, convert it to a non-animated non-fragmented image
+ // (to avoid writing FRGM/ANMF chunk unnecessarily).
WebPMuxError err = WebPMuxNumChunks(mux, kChunks[IDX_ANMF].id, &num_frames);
if (err != WEBP_MUX_OK) return err;
err = WebPMuxNumChunks(mux, kChunks[IDX_FRGM].id, &num_fragments);
@@ -559,14 +598,18 @@ static WebPMuxError MuxCleanup(WebPMux* const mux) {
WebPMuxImage* frame_frag;
err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame_frag);
assert(err == WEBP_MUX_OK); // We know that one frame/fragment does exist.
- if (frame_frag->header_ != NULL) {
+ assert(frame_frag != NULL);
+ if (frame_frag->header_ != NULL &&
+ ((mux->canvas_width_ == 0 && mux->canvas_height_ == 0) ||
+ (frame_frag->width_ == mux->canvas_width_ &&
+ frame_frag->height_ == mux->canvas_height_))) {
assert(frame_frag->header_->tag_ == kChunks[IDX_ANMF].tag ||
frame_frag->header_->tag_ == kChunks[IDX_FRGM].tag);
ChunkDelete(frame_frag->header_); // Removes ANMF/FRGM chunk.
frame_frag->header_ = NULL;
+ num_frames = 0;
+ num_fragments = 0;
}
- num_frames = 0;
- num_fragments = 0;
}
// Remove ANIM chunk if this is a non-animated image.
err = WebPMuxNumChunks(mux, kChunks[IDX_ANIM].id, &num_anim_chunks);
@@ -603,7 +646,13 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
uint8_t* dst = NULL;
WebPMuxError err;
- if (mux == NULL || assembled_data == NULL) {
+ if (assembled_data == NULL) {
+ return WEBP_MUX_INVALID_ARGUMENT;
+ }
+ // Clean up returned data, in case something goes wrong.
+ memset(assembled_data, 0, sizeof(*assembled_data));
+
+ if (mux == NULL) {
return WEBP_MUX_INVALID_ARGUMENT;
}
@@ -619,7 +668,7 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
+ ChunkListDiskSize(mux->exif_) + ChunkListDiskSize(mux->xmp_)
+ ChunkListDiskSize(mux->unknown_) + RIFF_HEADER_SIZE;
- data = (uint8_t*)malloc(size);
+ data = (uint8_t*)WebPSafeMalloc(1ULL, size);
if (data == NULL) return WEBP_MUX_MEMORY_ERROR;
// Emit header & chunks.
@@ -636,7 +685,7 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
// Validate mux.
err = MuxValidate(mux);
if (err != WEBP_MUX_OK) {
- free(data);
+ WebPSafeFree(data);
data = NULL;
size = 0;
}
@@ -649,4 +698,3 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
}
//------------------------------------------------------------------------------
-
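
The guards in the new WebPMuxSetCanvasSize() compose into a single predicate.
A standalone sketch, assuming the format_constants.h values
MAX_CANVAS_SIZE = (1 << 24) and MAX_IMAGE_AREA = (1ULL << 32):

    #include <stdint.h>

    #define MAX_CANVAS_SIZE (1 << 24)     // 24-bit limit per dimension
    #define MAX_IMAGE_AREA  (1ULL << 32)  // limit on width * height

    // Returns 1 iff (width, height) is an acceptable canvas size.
    static int IsValidCanvasSize(int width, int height) {
      if (width < 0 || height < 0) return 0;
      if (width > MAX_CANVAS_SIZE || height > MAX_CANVAS_SIZE) return 0;
      // Widen before multiplying so the area check cannot overflow 'int'.
      if (width * (uint64_t)height >= MAX_IMAGE_AREA) return 0;
      // Exactly one dimension being zero is invalid; (0, 0) means "unset".
      if ((width * height) == 0 && (width | height) != 0) return 0;
      return 1;
    }
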
diff --git a/src/3rdparty/libwebp/src/mux/muxi.h b/src/3rdparty/libwebp/src/mux/muxi.h
index 277d5fb..718b2f5 100644
--- a/src/3rdparty/libwebp/src/mux/muxi.h
+++ b/src/3rdparty/libwebp/src/mux/muxi.h
@@ -28,7 +28,7 @@ extern "C" {
#define MUX_MAJ_VERSION 0
#define MUX_MIN_VERSION 2
-#define MUX_REV_VERSION 0
+#define MUX_REV_VERSION 2
// Chunk object.
typedef struct WebPChunk WebPChunk;
@@ -65,7 +65,9 @@ struct WebPMux {
WebPChunk* anim_;
WebPChunk* vp8x_;
- WebPChunk* unknown_;
+ WebPChunk* unknown_;
+ int canvas_width_;
+ int canvas_height_;
};
// CHUNK_INDEX enum: used for indexing within 'kChunks' (defined below) only.
diff --git a/src/3rdparty/libwebp/src/mux/muxinternal.c b/src/3rdparty/libwebp/src/mux/muxinternal.c
index 3f992ce..4babbe8 100644
--- a/src/3rdparty/libwebp/src/mux/muxinternal.c
+++ b/src/3rdparty/libwebp/src/mux/muxinternal.c
@@ -165,7 +165,7 @@ WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list,
return WEBP_MUX_NOT_FOUND;
}
- new_chunk = (WebPChunk*)malloc(sizeof(*new_chunk));
+ new_chunk = (WebPChunk*)WebPSafeMalloc(1ULL, sizeof(*new_chunk));
if (new_chunk == NULL) return WEBP_MUX_MEMORY_ERROR;
*new_chunk = *chunk;
chunk->owner_ = 0;
@@ -179,7 +179,7 @@ WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list,
WebPChunk* ChunkDelete(WebPChunk* const chunk) {
WebPChunk* const next = ChunkRelease(chunk);
- free(chunk);
+ WebPSafeFree(chunk);
return next;
}
@@ -312,7 +312,7 @@ WebPMuxError MuxImagePush(const WebPMuxImage* wpi, WebPMuxImage** wpi_list) {
wpi_list = &cur_wpi->next_;
}
- new_wpi = (WebPMuxImage*)malloc(sizeof(*new_wpi));
+ new_wpi = (WebPMuxImage*)WebPSafeMalloc(1ULL, sizeof(*new_wpi));
if (new_wpi == NULL) return WEBP_MUX_MEMORY_ERROR;
*new_wpi = *wpi;
new_wpi->next_ = NULL;
@@ -331,7 +331,7 @@ WebPMuxError MuxImagePush(const WebPMuxImage* wpi, WebPMuxImage** wpi_list) {
WebPMuxImage* MuxImageDelete(WebPMuxImage* const wpi) {
// Delete the components of wpi. If wpi is NULL this is a noop.
WebPMuxImage* const next = MuxImageRelease(wpi);
- free(wpi);
+ WebPSafeFree(wpi);
return next;
}
diff --git a/src/3rdparty/libwebp/src/mux/muxread.c b/src/3rdparty/libwebp/src/mux/muxread.c
index 6003a25..bba09a5 100644
--- a/src/3rdparty/libwebp/src/mux/muxread.c
+++ b/src/3rdparty/libwebp/src/mux/muxread.c
@@ -57,7 +57,7 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk,
WebPData chunk_data;
// Sanity checks.
- if (data_size < TAG_SIZE) return WEBP_MUX_NOT_ENOUGH_DATA;
+ if (data_size < CHUNK_HEADER_SIZE) return WEBP_MUX_NOT_ENOUGH_DATA;
chunk_size = GetLE32(data + TAG_SIZE);
{
@@ -220,7 +220,7 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
data += RIFF_HEADER_SIZE;
size -= RIFF_HEADER_SIZE;
- wpi = (WebPMuxImage*)malloc(sizeof(*wpi));
+ wpi = (WebPMuxImage*)WebPSafeMalloc(1ULL, sizeof(*wpi));
if (wpi == NULL) goto Err;
MuxImageInit(wpi);
@@ -264,6 +264,10 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
// getting all chunks of an image.
chunk_list = MuxGetChunkListFromId(mux, id); // List to add this chunk.
if (ChunkSetNth(&chunk, chunk_list, 0) != WEBP_MUX_OK) goto Err;
+ if (id == WEBP_CHUNK_VP8X) { // grab global specs
+ mux->canvas_width_ = GetLE24(data + 12) + 1;
+ mux->canvas_height_ = GetLE24(data + 15) + 1;
+ }
break;
}
data += data_size;
@@ -320,14 +324,20 @@ static WebPMuxError MuxGetCanvasInfo(const WebPMux* const mux,
f = GetLE32(data.bytes + 0);
w = GetLE24(data.bytes + 4) + 1;
h = GetLE24(data.bytes + 7) + 1;
- } else { // Single image case.
+ } else {
const WebPMuxImage* const wpi = mux->images_;
- WebPMuxError err = ValidateForSingleImage(mux);
- if (err != WEBP_MUX_OK) return err;
- assert(wpi != NULL);
- w = wpi->width_;
- h = wpi->height_;
- if (wpi->has_alpha_) f |= ALPHA_FLAG;
+ // Grab user-forced canvas size as default.
+ w = mux->canvas_width_;
+ h = mux->canvas_height_;
+ if (w == 0 && h == 0 && ValidateForSingleImage(mux) == WEBP_MUX_OK) {
+ // single image and no forced canvas size => use dimensions of first frame
+ assert(wpi != NULL);
+ w = wpi->width_;
+ h = wpi->height_;
+ }
+ if (wpi != NULL) {
+ if (wpi->has_alpha_) f |= ALPHA_FLAG;
+ }
}
if (w * (uint64_t)h >= MAX_IMAGE_AREA) return WEBP_MUX_BAD_DATA;
@@ -375,7 +385,7 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
// Note: No need to output ANMF/FRGM chunk for a single image.
const size_t size = RIFF_HEADER_SIZE + vp8x_size + alpha_size +
ChunkDiskSize(wpi->img_);
- uint8_t* const data = (uint8_t*)malloc(size);
+ uint8_t* const data = (uint8_t*)WebPSafeMalloc(1ULL, size);
if (data == NULL) return WEBP_MUX_MEMORY_ERROR;
// Main RIFF header.
@@ -537,4 +547,3 @@ WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
}
//------------------------------------------------------------------------------
-
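
The VP8X fields read above store the canvas dimensions as 24-bit little-endian
values minus one: with 'chunk' pointing at the fourcc, the width field sits at
byte offset 12 and the height field at offset 15 (tag, then 32-bit size, then
4 bytes of flags). A minimal sketch of the decode, with GetLE24 re-derived
here for illustration:

    #include <stdint.h>

    static uint32_t GetLE24(const uint8_t* const p) {
      return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);
    }

    static void ReadVP8XCanvas(const uint8_t* const chunk, int* w, int* h) {
      *w = (int)GetLE24(chunk + 12) + 1;  // stored as width - 1
      *h = (int)GetLE24(chunk + 15) + 1;  // stored as height - 1
    }
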
diff --git a/src/3rdparty/libwebp/src/utils/alpha_processing.h b/src/3rdparty/libwebp/src/utils/alpha_processing.h
deleted file mode 100644
index 80e1ae4..0000000
--- a/src/3rdparty/libwebp/src/utils/alpha_processing.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Utilities for processing transparent channel.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_UTILS_ALPHA_PROCESSING_H_
-#define WEBP_UTILS_ALPHA_PROCESSING_H_
-
-#include "../webp/types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
-// Un-Multiply operation transforms x into x * 255 / A.
-
-// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
-void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse);
-
-// Same as WebPMultARGBRow(), but for several rows.
-void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
- int inverse);
-
-// Same for a row of single values, with side alpha values.
-void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
- int width, int inverse);
-
-// Same as WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
- int width, int num_rows, int inverse);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // WEBP_UTILS_ALPHA_PROCESSING_H_
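
The operations documented in the deleted header (the code itself moved under
src/dsp/ as part of this update) reduce to per-channel integer arithmetic.
A plain sketch for clarity only; the library dispatches to SIMD variants
rather than dividing per pixel:

    #include <stdint.h>

    // Pre-multiply: x -> x * A / 255, with rounding.
    static uint8_t Premultiply(uint8_t v, uint8_t alpha) {
      return (uint8_t)((v * alpha + 127) / 255);
    }

    // Un-multiply: x -> x * 255 / A, saturated; alpha == 0 maps to 0.
    static uint8_t Unmultiply(uint8_t v, uint8_t alpha) {
      uint32_t x;
      if (alpha == 0) return 0;
      x = (v * 255u + alpha / 2) / alpha;
      return (x > 255u) ? 255 : (uint8_t)x;
    }
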
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.c b/src/3rdparty/libwebp/src/utils/bit_reader.c
index bfa4d7d..64503e6 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.c
+++ b/src/3rdparty/libwebp/src/utils/bit_reader.c
@@ -7,18 +7,16 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// Boolean decoder
+// Boolean decoder non-inlined methods
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./bit_reader.h"
-
-#ifndef USE_RIGHT_JUSTIFY
-#define MK(X) (((range_t)(X) << (BITS)) | (MASK))
-#else
-#define MK(X) ((range_t)(X))
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
#endif
+#include "./bit_reader_inl.h"
+
//------------------------------------------------------------------------------
// VP8BitReader
@@ -27,12 +25,20 @@ void VP8InitBitReader(VP8BitReader* const br,
assert(br != NULL);
assert(start != NULL);
assert(start <= end);
- br->range_ = MK(255 - 1);
+ br->range_ = 255 - 1;
br->buf_ = start;
br->buf_end_ = end;
br->value_ = 0;
br->bits_ = -8; // to load the very first 8bits
br->eof_ = 0;
+ VP8LoadNewBytes(br);
+}
+
+void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
+ if (br->buf_ != NULL) {
+ br->buf_ += offset;
+ br->buf_end_ += offset;
+ }
}
const uint8_t kVP8Log2Range[128] = {
@@ -47,45 +53,35 @@ const uint8_t kVP8Log2Range[128] = {
0
};
-// range = (range << kVP8Log2Range[range]) + trailing 1's
+// range = ((range + 1) << kVP8Log2Range[range]) - 1
const range_t kVP8NewRange[128] = {
- MK(127), MK(127), MK(191), MK(127), MK(159), MK(191), MK(223), MK(127),
- MK(143), MK(159), MK(175), MK(191), MK(207), MK(223), MK(239), MK(127),
- MK(135), MK(143), MK(151), MK(159), MK(167), MK(175), MK(183), MK(191),
- MK(199), MK(207), MK(215), MK(223), MK(231), MK(239), MK(247), MK(127),
- MK(131), MK(135), MK(139), MK(143), MK(147), MK(151), MK(155), MK(159),
- MK(163), MK(167), MK(171), MK(175), MK(179), MK(183), MK(187), MK(191),
- MK(195), MK(199), MK(203), MK(207), MK(211), MK(215), MK(219), MK(223),
- MK(227), MK(231), MK(235), MK(239), MK(243), MK(247), MK(251), MK(127),
- MK(129), MK(131), MK(133), MK(135), MK(137), MK(139), MK(141), MK(143),
- MK(145), MK(147), MK(149), MK(151), MK(153), MK(155), MK(157), MK(159),
- MK(161), MK(163), MK(165), MK(167), MK(169), MK(171), MK(173), MK(175),
- MK(177), MK(179), MK(181), MK(183), MK(185), MK(187), MK(189), MK(191),
- MK(193), MK(195), MK(197), MK(199), MK(201), MK(203), MK(205), MK(207),
- MK(209), MK(211), MK(213), MK(215), MK(217), MK(219), MK(221), MK(223),
- MK(225), MK(227), MK(229), MK(231), MK(233), MK(235), MK(237), MK(239),
- MK(241), MK(243), MK(245), MK(247), MK(249), MK(251), MK(253), MK(127)
+ 127, 127, 191, 127, 159, 191, 223, 127,
+ 143, 159, 175, 191, 207, 223, 239, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191,
+ 199, 207, 215, 223, 231, 239, 247, 127,
+ 131, 135, 139, 143, 147, 151, 155, 159,
+ 163, 167, 171, 175, 179, 183, 187, 191,
+ 195, 199, 203, 207, 211, 215, 219, 223,
+ 227, 231, 235, 239, 243, 247, 251, 127,
+ 129, 131, 133, 135, 137, 139, 141, 143,
+ 145, 147, 149, 151, 153, 155, 157, 159,
+ 161, 163, 165, 167, 169, 171, 173, 175,
+ 177, 179, 181, 183, 185, 187, 189, 191,
+ 193, 195, 197, 199, 201, 203, 205, 207,
+ 209, 211, 213, 215, 217, 219, 221, 223,
+ 225, 227, 229, 231, 233, 235, 237, 239,
+ 241, 243, 245, 247, 249, 251, 253, 127
};
-#undef MK
-
void VP8LoadFinalBytes(VP8BitReader* const br) {
assert(br != NULL && br->buf_ != NULL);
// Only read 8bits at a time
if (br->buf_ < br->buf_end_) {
-#ifndef USE_RIGHT_JUSTIFY
- br->value_ |= (bit_t)(*br->buf_++) << ((BITS) - 8 - br->bits_);
-#else
- br->value_ = (bit_t)(*br->buf_++) | (br->value_ << 8);
-#endif
br->bits_ += 8;
+ br->value_ = (bit_t)(*br->buf_++) | (br->value_ << 8);
} else if (!br->eof_) {
-#ifdef USE_RIGHT_JUSTIFY
- // These are not strictly needed, but it makes the behaviour
- // consistent for both USE_RIGHT_JUSTIFY and !USE_RIGHT_JUSTIFY.
br->value_ <<= 8;
br->bits_ += 8;
-#endif
br->eof_ = 1;
}
}
@@ -109,36 +105,48 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
//------------------------------------------------------------------------------
// VP8LBitReader
-#define MAX_NUM_BIT_READ 25
+#define VP8L_LOG8_WBITS 4 // Number of bytes needed to store VP8L_WBITS bits.
-#define LBITS 64 // Number of bits prefetched.
-#define WBITS 32 // Minimum number of bits needed after VP8LFillBitWindow.
-#define LOG8_WBITS 4 // Number of bytes needed to store WBITS bits.
+#if !defined(WEBP_FORCE_ALIGNED) && \
+ (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__x86_64__) || defined(_M_X64))
+#define VP8L_USE_UNALIGNED_LOAD
+#endif
-static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
- 0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
- 65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
+static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
+ 0,
+ 0x000001, 0x000003, 0x000007, 0x00000f,
+ 0x00001f, 0x00003f, 0x00007f, 0x0000ff,
+ 0x0001ff, 0x0003ff, 0x0007ff, 0x000fff,
+ 0x001fff, 0x003fff, 0x007fff, 0x00ffff,
+ 0x01ffff, 0x03ffff, 0x07ffff, 0x0fffff,
+ 0x1fffff, 0x3fffff, 0x7fffff, 0xffffff
};
-void VP8LInitBitReader(VP8LBitReader* const br,
- const uint8_t* const start,
+void VP8LInitBitReader(VP8LBitReader* const br, const uint8_t* const start,
size_t length) {
size_t i;
+ vp8l_val_t value = 0;
assert(br != NULL);
assert(start != NULL);
assert(length < 0xfffffff8u); // can't happen with a RIFF chunk.
- br->buf_ = start;
br->len_ = length;
br->val_ = 0;
- br->pos_ = 0;
br->bit_pos_ = 0;
br->eos_ = 0;
br->error_ = 0;
- for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
- br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (8 * i);
- ++br->pos_;
+
+ if (length > sizeof(br->val_)) {
+ length = sizeof(br->val_);
+ }
+ for (i = 0; i < length; ++i) {
+ value |= (vp8l_val_t)start[i] << (8 * i);
}
+ br->val_ = value;
+ br->pos_ = length;
+ br->buf_ = start;
}
void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
@@ -146,55 +154,51 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
assert(br != NULL);
assert(buf != NULL);
assert(len < 0xfffffff8u); // can't happen with a RIFF chunk.
- br->eos_ = (br->pos_ >= len);
br->buf_ = buf;
br->len_ = len;
+ // pos_ > len_ should be considered a param error.
+ br->error_ = (br->pos_ > br->len_);
+ br->eos_ = br->error_ || VP8LIsEndOfStream(br);
}
-// If not at EOS, reload up to LBITS byte-by-byte
+// If not at EOS, reload up to VP8L_LBITS bits, byte by byte
static void ShiftBytes(VP8LBitReader* const br) {
while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
br->val_ >>= 8;
- br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (LBITS - 8);
+ br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (VP8L_LBITS - 8);
++br->pos_;
br->bit_pos_ -= 8;
}
+ br->eos_ = VP8LIsEndOfStream(br);
}
-void VP8LFillBitWindow(VP8LBitReader* const br) {
- if (br->bit_pos_ >= WBITS) {
-#if (defined(__x86_64__) || defined(_M_X64))
- if (br->pos_ + sizeof(br->val_) < br->len_) {
- br->val_ >>= WBITS;
- br->bit_pos_ -= WBITS;
- // The expression below needs a little-endian arch to work correctly.
- // This gives a large speedup for decoding speed.
- br->val_ |= *(const vp8l_val_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
- br->pos_ += LOG8_WBITS;
- return;
- }
-#endif
- ShiftBytes(br); // Slow path.
- if (br->pos_ == br->len_ && br->bit_pos_ >= LBITS) {
- br->eos_ = 1;
- }
+void VP8LDoFillBitWindow(VP8LBitReader* const br) {
+ assert(br->bit_pos_ >= VP8L_WBITS);
+ // TODO(jzern): given the fixed read size it may be possible to force
+ // alignment in this block.
+#if defined(VP8L_USE_UNALIGNED_LOAD)
+ if (br->pos_ + sizeof(br->val_) < br->len_) {
+ br->val_ >>= VP8L_WBITS;
+ br->bit_pos_ -= VP8L_WBITS;
+ // The expression below needs a little-endian arch to work correctly.
+ // This gives a large speedup for decoding speed.
+ br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+ (VP8L_LBITS - VP8L_WBITS);
+ br->pos_ += VP8L_LOG8_WBITS;
+ return;
}
+#endif
+ ShiftBytes(br); // Slow path.
}
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
assert(n_bits >= 0);
// Flag an error if end_of_stream or n_bits is more than allowed limit.
- if (!br->eos_ && n_bits < MAX_NUM_BIT_READ) {
+ if (!br->eos_ && n_bits <= VP8L_MAX_NUM_BIT_READ) {
const uint32_t val =
(uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
const int new_bits = br->bit_pos_ + n_bits;
br->bit_pos_ = new_bits;
- // If this read is going to cross the read buffer, set the eos flag.
- if (br->pos_ == br->len_) {
- if (new_bits >= LBITS) {
- br->eos_ = 1;
- }
- }
ShiftBytes(br);
return val;
} else {
@@ -204,4 +208,3 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
}
//------------------------------------------------------------------------------
-
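
Stripped of the VP8L_LBITS/VP8L_WBITS plumbing and the unaligned-load fast
path, the lossless read loop above amounts to: serve up to 24 bits from a
64-bit little-endian prefetch window, then refill byte by byte. A
self-contained sketch:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
      uint64_t val;         // prefetched bits, little-endian byte order
      const uint8_t* buf;
      size_t pos, len;      // next byte to fetch / total buffer length
      int bit_pos;          // number of bits of 'val' already consumed
    } MiniLReader;

    static void MiniShiftBytes(MiniLReader* const br) {
      while (br->bit_pos >= 8 && br->pos < br->len) {
        br->val >>= 8;
        br->val |= (uint64_t)br->buf[br->pos++] << (64 - 8);
        br->bit_pos -= 8;
      }
    }

    static uint32_t MiniReadBits(MiniLReader* const br, int n_bits) {
      uint32_t bits;
      assert(n_bits >= 0 && n_bits <= 24);  // VP8L_MAX_NUM_BIT_READ
      bits = (uint32_t)(br->val >> br->bit_pos) & ((1u << n_bits) - 1u);
      br->bit_pos += n_bits;
      MiniShiftBytes(br);
      return bits;
    }
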
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.h b/src/3rdparty/libwebp/src/utils/bit_reader.h
index 98df98a..f569734 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader.h
@@ -29,110 +29,62 @@ extern "C" {
// However, since range_ is only 8bit, we only need an active window of 8 bits
// for value_. Left bits (MSB) get zeroed and shifted away when value_ falls
// below 128, range_ is updated, and fresh bits read from the bitstream are
-// brought in as LSB.
-// To avoid reading the fresh bits one by one (slow), we cache a few of them
-// ahead (actually, we cache BITS of them ahead. See below). There's two
-// strategies regarding how to shift these looked-ahead fresh bits into the
-// 8bit window of value_: either we shift them in, while keeping the position of
-// the window fixed. Or we slide the window to the right while keeping the cache
-// bits at a fixed, right-justified, position.
+// brought in as LSB. To avoid reading the fresh bits one by one (slow), we
+// cache BITS of them ahead. The total of (BITS + 8) bits must fit into a
+// natural register (with type bit_t). To fetch BITS bits from the bitstream
+// we use the type lbit_t.
//
-// Example, for BITS=16: here is the content of value_ for both strategies:
-//
-// !USE_RIGHT_JUSTIFY || USE_RIGHT_JUSTIFY
-// ||
-// <- 8b -><- 8b -><- BITS bits -> || <- 8b+3b -><- 8b -><- 13 bits ->
-// [unused][value_][cached bits][0] || [unused...][value_][cached bits]
-// [........00vvvvvvBBBBBBBBBBBBB000]LSB || [...........00vvvvvvBBBBBBBBBBBBB]
-// ||
-// After calling VP8Shift(), where we need to shift away two zeros:
-// [........vvvvvvvvBBBBBBBBBBB00000]LSB || [.............vvvvvvvvBBBBBBBBBBB]
-// ||
-// Just before we need to call VP8LoadNewBytes(), the situation is:
-// [........vvvvvv000000000000000000]LSB || [..........................vvvvvv]
-// ||
-// And just after calling VP8LoadNewBytes():
-// [........vvvvvvvvBBBBBBBBBBBBBBBB]LSB || [........vvvvvvvvBBBBBBBBBBBBBBBB]
-//
-// -> we're back to eight active 'value_' bits (marked 'v') and BITS cached
-// bits (marked 'B')
-//
-// The right-justify strategy tends to use less shifts and is often faster.
-
-//------------------------------------------------------------------------------
// BITS can be any multiple of 8 from 8 to 56 (inclusive).
// Pick values that fit natural register size.
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
-
-#define USE_RIGHT_JUSTIFY
-
#if defined(__i386__) || defined(_M_IX86) // x86 32bit
-#define BITS 16
+#define BITS 24
#elif defined(__x86_64__) || defined(_M_X64) // x86 64bit
#define BITS 56
#elif defined(__arm__) || defined(_M_ARM) // ARM
#define BITS 24
-#else // reasonable default
+#elif defined(__mips__) // MIPS
#define BITS 24
-#endif
-
-#else // reference choices
-
-#define USE_RIGHT_JUSTIFY
-#define BITS 8
-
+#else // reasonable default
+#define BITS 24 // TODO(skal): test aarch64 and find the proper BITS value.
#endif
//------------------------------------------------------------------------------
-// Derived types and constants
+// Derived types and constants:
+// bit_t = natural register type for storing 'value_' (which is BITS+8 bits)
+// range_t = register for 'range_' (which is 8bits only)
-// bit_t = natural register type
-// lbit_t = natural type for memory I/O
-
-#if (BITS > 32)
-typedef uint64_t bit_t;
-typedef uint64_t lbit_t;
-#elif (BITS == 32)
+#if (BITS > 24)
typedef uint64_t bit_t;
-typedef uint32_t lbit_t;
-#elif (BITS == 24)
-typedef uint32_t bit_t;
-typedef uint32_t lbit_t;
-#elif (BITS == 16)
-typedef uint32_t bit_t;
-typedef uint16_t lbit_t;
#else
typedef uint32_t bit_t;
-typedef uint8_t lbit_t;
#endif
-#ifndef USE_RIGHT_JUSTIFY
-typedef bit_t range_t; // type for storing range_
-#define MASK ((((bit_t)1) << (BITS)) - 1)
-#else
-typedef uint32_t range_t; // range_ only uses 8bits here. No need for bit_t.
-#endif
+typedef uint32_t range_t;
//------------------------------------------------------------------------------
// Bitreader
typedef struct VP8BitReader VP8BitReader;
struct VP8BitReader {
+ // boolean decoder (keep the field ordering as is!)
+ bit_t value_; // current value
+ range_t range_; // current range minus 1. In [127, 254] interval.
+ int bits_; // number of valid bits left
+ // read buffer
const uint8_t* buf_; // next byte to be read
const uint8_t* buf_end_; // end of read buffer
int eof_; // true if input is exhausted
-
- // boolean decoder
- range_t range_; // current range minus 1. In [127, 254] interval.
- bit_t value_; // current value
- int bits_; // number of valid bits left
};
// Initialize the bit reader and the boolean decoder.
void VP8InitBitReader(VP8BitReader* const br,
const uint8_t* const start, const uint8_t* const end);
+// Update internal pointers to displace the byte buffer by the
+// relative offset 'offset'.
+void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
+
// return the next value made of 'num_bits' bits
uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
@@ -142,152 +94,22 @@ static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
// return the next value with sign-extension.
int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
-// Read a bit with proba 'prob'. Speed-critical function!
-extern const uint8_t kVP8Log2Range[128];
-extern const range_t kVP8NewRange[128];
-
-void VP8LoadFinalBytes(VP8BitReader* const br); // special case for the tail
-
-static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
- assert(br != NULL && br->buf_ != NULL);
- // Read 'BITS' bits at a time if possible.
- if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
- // convert memory type to register type (with some zero'ing!)
- bit_t bits;
- const lbit_t in_bits = *(const lbit_t*)br->buf_;
- br->buf_ += (BITS) >> 3;
-#if !defined(__BIG_ENDIAN__)
-#if (BITS > 32)
-// gcc 4.3 has builtin functions for swap32/swap64
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
- bits = (bit_t)__builtin_bswap64(in_bits);
-#elif defined(_MSC_VER)
- bits = (bit_t)_byteswap_uint64(in_bits);
-#elif defined(__x86_64__)
- __asm__ volatile("bswapq %0" : "=r"(bits) : "0"(in_bits));
-#else // generic code for swapping 64-bit values (suggested by bdb@)
- bits = (bit_t)in_bits;
- bits = ((bits & 0xffffffff00000000ull) >> 32) |
- ((bits & 0x00000000ffffffffull) << 32);
- bits = ((bits & 0xffff0000ffff0000ull) >> 16) |
- ((bits & 0x0000ffff0000ffffull) << 16);
- bits = ((bits & 0xff00ff00ff00ff00ull) >> 8) |
- ((bits & 0x00ff00ff00ff00ffull) << 8);
-#endif
- bits >>= 64 - BITS;
-#elif (BITS >= 24)
-#if defined(__i386__) || defined(__x86_64__)
- {
- lbit_t swapped_in_bits;
- __asm__ volatile("bswap %k0" : "=r"(swapped_in_bits) : "0"(in_bits));
- bits = (bit_t)swapped_in_bits; // 24b/32b -> 32b/64b zero-extension
- }
-#elif defined(_MSC_VER)
- bits = (bit_t)_byteswap_ulong(in_bits);
-#else
- bits = (bit_t)(in_bits >> 24) | ((in_bits >> 8) & 0xff00)
- | ((in_bits << 8) & 0xff0000) | (in_bits << 24);
-#endif // x86
- bits >>= (32 - BITS);
-#elif (BITS == 16)
- // gcc will recognize a 'rorw $8, ...' here:
- bits = (bit_t)(in_bits >> 8) | ((in_bits & 0xff) << 8);
-#else // BITS == 8
- bits = (bit_t)in_bits;
-#endif
-#else // BIG_ENDIAN
- bits = (bit_t)in_bits;
- if (BITS != 8 * sizeof(bit_t)) bits >>= (8 * sizeof(bit_t) - BITS);
-#endif
-#ifndef USE_RIGHT_JUSTIFY
- br->value_ |= bits << (-br->bits_);
-#else
- br->value_ = bits | (br->value_ << (BITS));
-#endif
- br->bits_ += (BITS);
- } else {
- VP8LoadFinalBytes(br); // no need to be inlined
- }
-}
-
-static WEBP_INLINE int VP8BitUpdate(VP8BitReader* const br, range_t split) {
- if (br->bits_ < 0) { // Make sure we have a least BITS bits in 'value_'
- VP8LoadNewBytes(br);
- }
-#ifndef USE_RIGHT_JUSTIFY
- split |= (MASK);
- if (br->value_ > split) {
- br->range_ -= split + 1;
- br->value_ -= split + 1;
- return 1;
- } else {
- br->range_ = split;
- return 0;
- }
-#else
- {
- const int pos = br->bits_;
- const range_t value = (range_t)(br->value_ >> pos);
- if (value > split) {
- br->range_ -= split + 1;
- br->value_ -= (bit_t)(split + 1) << pos;
- return 1;
- } else {
- br->range_ = split;
- return 0;
- }
- }
-#endif
-}
-
-static WEBP_INLINE void VP8Shift(VP8BitReader* const br) {
-#ifndef USE_RIGHT_JUSTIFY
- // range_ is in [0..127] interval here.
- const bit_t idx = br->range_ >> (BITS);
- const int shift = kVP8Log2Range[idx];
- br->range_ = kVP8NewRange[idx];
- br->value_ <<= shift;
- br->bits_ -= shift;
-#else
- const int shift = kVP8Log2Range[br->range_];
- assert(br->range_ < (range_t)128);
- br->range_ = kVP8NewRange[br->range_];
- br->bits_ -= shift;
-#endif
-}
-
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
-#ifndef USE_RIGHT_JUSTIFY
- // It's important to avoid generating a 64bit x 64bit multiply here.
- // We just need an 8b x 8b after all.
- const range_t split =
- (range_t)((uint32_t)(br->range_ >> (BITS)) * prob) << ((BITS) - 8);
- const int bit = VP8BitUpdate(br, split);
- if (br->range_ <= (((range_t)0x7e << (BITS)) | (MASK))) {
- VP8Shift(br);
- }
- return bit;
-#else
- const range_t split = (br->range_ * prob) >> 8;
- const int bit = VP8BitUpdate(br, split);
- if (br->range_ <= (range_t)0x7e) {
- VP8Shift(br);
- }
- return bit;
-#endif
-}
-
-static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
- const range_t split = (br->range_ >> 1);
- const int bit = VP8BitUpdate(br, split);
- VP8Shift(br);
- return bit ? -v : v;
-}
+// bit_reader_inl.h will implement the following methods:
+// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
+// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
+// and should be included by the .c files that actually need them.
+// This is to avoid recompiling the whole library whenever this file is touched,
+// and also to allow platform-specific ad-hoc hacks.
// -----------------------------------------------------------------------------
// Bitreader for lossless format
+// maximum number of bits (inclusive) the bit-reader can handle:
+#define VP8L_MAX_NUM_BIT_READ 24
+
+#define VP8L_LBITS 64 // Number of bits prefetched.
+#define VP8L_WBITS 32 // Minimum number of bits ready after VP8LFillBitWindow.
+
typedef uint64_t vp8l_val_t; // right now, this bit-reader can only use 64bit.
typedef struct {
@@ -308,9 +130,10 @@ void VP8LInitBitReader(VP8LBitReader* const br,
void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
const uint8_t* const buffer, size_t length);
-// Reads the specified number of bits from Read Buffer.
-// Flags an error in case end_of_stream or n_bits is more than allowed limit.
-// Flags eos if this read attempt is going to cross the read buffer.
+// Reads the specified number of bits from read buffer.
+// Flags an error in case end_of_stream or n_bits is more than the allowed limit
+// of VP8L_MAX_NUM_BIT_READ (inclusive).
+// Flags eos_ if this read attempt is going to cross the read buffer.
uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
// Return the prefetched bits, so they can be looked up.
@@ -318,14 +141,26 @@ static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
return (uint32_t)(br->val_ >> br->bit_pos_);
}
+// Returns true if there was an attempt at reading bits past the end of
+// the buffer. Doesn't set the br->eos_ flag.
+static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
+ assert(br->pos_ <= br->len_);
+ return (br->pos_ == br->len_) && (br->bit_pos_ > VP8L_LBITS);
+}
+
// For jumping over a number of bits in the bit stream when accessed with
// VP8LPrefetchBits and VP8LFillBitWindow.
static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
br->bit_pos_ = val;
+ br->eos_ = VP8LIsEndOfStream(br);
}
// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
-void VP8LFillBitWindow(VP8LBitReader* const br);
+// Speed-critical, but the infrequent part of the code can be non-inlined.
+extern void VP8LDoFillBitWindow(VP8LBitReader* const br);
+static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
+ if (br->bit_pos_ >= VP8L_WBITS) VP8LDoFillBitWindow(br);
+}
#ifdef __cplusplus
} // extern "C"
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h b/src/3rdparty/libwebp/src/utils/bit_reader_inl.h
new file mode 100644
index 0000000..81427c6
--- /dev/null
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_inl.h
@@ -0,0 +1,172 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Specific inlined methods for boolean decoder [VP8GetBit() ...]
+// This file should be included by the .c sources that actually need to call
+// these methods.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_BIT_READER_INL_H_
+#define WEBP_UTILS_BIT_READER_INL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#ifdef WEBP_FORCE_ALIGNED
+#include <string.h> // memcpy
+#endif
+
+#include "../dsp/dsp.h"
+#include "./bit_reader.h"
+#include "./endian_inl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Derived type lbit_t = natural type for memory I/O
+
+#if (BITS > 32)
+typedef uint64_t lbit_t;
+#elif (BITS > 16)
+typedef uint32_t lbit_t;
+#elif (BITS > 8)
+typedef uint16_t lbit_t;
+#else
+typedef uint8_t lbit_t;
+#endif
+
+extern const uint8_t kVP8Log2Range[128];
+extern const range_t kVP8NewRange[128];
+
+// special case for the tail byte-reading
+void VP8LoadFinalBytes(VP8BitReader* const br);
+
+//------------------------------------------------------------------------------
+// Inlined critical functions
+
+// makes sure br->value_ has at least BITS bits worth of data
+static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
+ assert(br != NULL && br->buf_ != NULL);
+ // Read 'BITS' bits at a time if possible.
+ if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
+ // convert memory type to register type (with some zero'ing!)
+ bit_t bits;
+#if defined(WEBP_FORCE_ALIGNED)
+ lbit_t in_bits;
+ memcpy(&in_bits, br->buf_, sizeof(in_bits));
+#elif defined(WEBP_USE_MIPS32)
+ // This is needed because of un-aligned read.
+ lbit_t in_bits;
+ lbit_t* p_buf_ = (lbit_t*)br->buf_;
+ __asm__ volatile(
+ ".set push \n\t"
+ ".set at \n\t"
+ ".set macro \n\t"
+ "ulw %[in_bits], 0(%[p_buf_]) \n\t"
+ ".set pop \n\t"
+ : [in_bits]"=r"(in_bits)
+ : [p_buf_]"r"(p_buf_)
+ : "memory", "at"
+ );
+#else
+ const lbit_t in_bits = *(const lbit_t*)br->buf_;
+#endif
+ br->buf_ += BITS >> 3;
+#if !defined(WORDS_BIGENDIAN)
+#if (BITS > 32)
+ bits = BSwap64(in_bits);
+ bits >>= 64 - BITS;
+#elif (BITS >= 24)
+ bits = BSwap32(in_bits);
+ bits >>= (32 - BITS);
+#elif (BITS == 16)
+ bits = BSwap16(in_bits);
+#else // BITS == 8
+ bits = (bit_t)in_bits;
+#endif // BITS > 32
+#else // WORDS_BIGENDIAN
+ bits = (bit_t)in_bits;
+ if (BITS != 8 * sizeof(bit_t)) bits >>= (8 * sizeof(bit_t) - BITS);
+#endif
+ br->value_ = bits | (br->value_ << BITS);
+ br->bits_ += BITS;
+ } else {
+ VP8LoadFinalBytes(br); // no need to be inlined
+ }
+}
+
+// Read a bit with proba 'prob'. Speed-critical function!
+static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
+ // Don't move this declaration! It makes a big speed difference to store
+ // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
+ // alter br->range_ value.
+ range_t range = br->range_;
+ if (br->bits_ < 0) {
+ VP8LoadNewBytes(br);
+ }
+ {
+ const int pos = br->bits_;
+ const range_t split = (range * prob) >> 8;
+ const range_t value = (range_t)(br->value_ >> pos);
+#if defined(__arm__) || defined(_M_ARM) // ARM-specific
+ const int bit = ((int)(split - value) >> 31) & 1;
+ if (value > split) {
+ range -= split + 1;
+ br->value_ -= (bit_t)(split + 1) << pos;
+ } else {
+ range = split;
+ }
+#else // faster version on x86
+ int bit; // Don't use 'const int bit = (value > split);', it's slower.
+ if (value > split) {
+ range -= split + 1;
+ br->value_ -= (bit_t)(split + 1) << pos;
+ bit = 1;
+ } else {
+ range = split;
+ bit = 0;
+ }
+#endif
+ if (range <= (range_t)0x7e) {
+ const int shift = kVP8Log2Range[range];
+ range = kVP8NewRange[range];
+ br->bits_ -= shift;
+ }
+ br->range_ = range;
+ return bit;
+ }
+}
+
+// simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
+static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
+ if (br->bits_ < 0) {
+ VP8LoadNewBytes(br);
+ }
+ {
+ const int pos = br->bits_;
+ const range_t split = br->range_ >> 1;
+ const range_t value = (range_t)(br->value_ >> pos);
+ const int32_t mask = (int32_t)(split - value) >> 31; // -1 or 0
+ br->bits_ -= 1;
+ br->range_ += mask;
+ br->range_ |= 1;
+ br->value_ -= (bit_t)((split + 1) & mask) << pos;
+ return (v ^ mask) - mask;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // WEBP_UTILS_BIT_READER_INL_H_
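
The branchless core of VP8GetSigned() above hinges on an arithmetic right
shift producing an all-zeros or all-ones mask (sign-extending shifts of
negative values are implementation-defined in C, but hold on the compilers
this code targets). Isolated for inspection, with inputs small enough that
the subtraction cannot overflow:

    #include <assert.h>
    #include <stdint.h>

    // Returns -v if value > split, +v otherwise, without branching.
    static int32_t SelectSign(int32_t v, int32_t split, int32_t value) {
      const int32_t mask = (split - value) >> 31;  // -1 iff value > split
      return (v ^ mask) - mask;  // two's-complement conditional negate
    }

    int main(void) {
      assert(SelectSign(7, 100, 200) == -7);  // value > split
      assert(SelectSign(7, 200, 100) ==  7);  // value <= split
      return 0;
    }
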
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.c b/src/3rdparty/libwebp/src/utils/bit_writer.c
index 29810a1..9875ca6 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.c
+++ b/src/3rdparty/libwebp/src/utils/bit_writer.c
@@ -15,7 +15,10 @@
#include <assert.h>
#include <string.h> // for memcpy()
#include <stdlib.h>
+
#include "./bit_writer.h"
+#include "./endian_inl.h"
+#include "./utils.h"
//------------------------------------------------------------------------------
// VP8BitWriter
@@ -34,7 +37,7 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
new_size = 2 * bw->max_pos_;
if (new_size < needed_size) new_size = needed_size;
if (new_size < 1024) new_size = 1024;
- new_buf = (uint8_t*)malloc(new_size);
+ new_buf = (uint8_t*)WebPSafeMalloc(1ULL, new_size);
if (new_buf == NULL) {
bw->error_ = 1;
return 0;
@@ -43,13 +46,13 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
assert(bw->buf_ != NULL);
memcpy(new_buf, bw->buf_, bw->pos_);
}
- free(bw->buf_);
+ WebPSafeFree(bw->buf_);
bw->buf_ = new_buf;
bw->max_pos_ = new_size;
return 1;
}
-static void kFlush(VP8BitWriter* const bw) {
+static void Flush(VP8BitWriter* const bw) {
const int s = 8 + bw->nb_bits_;
const int32_t bits = bw->value_ >> s;
assert(bw->nb_bits_ >= 0);
@@ -115,7 +118,7 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
bw->range_ = kNewRange[bw->range_];
bw->value_ <<= shift;
bw->nb_bits_ += shift;
- if (bw->nb_bits_ > 0) kFlush(bw);
+ if (bw->nb_bits_ > 0) Flush(bw);
}
return bit;
}
@@ -132,7 +135,7 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
bw->range_ = kNewRange[bw->range_];
bw->value_ <<= 1;
bw->nb_bits_ += 1;
- if (bw->nb_bits_ > 0) kFlush(bw);
+ if (bw->nb_bits_ > 0) Flush(bw);
}
return bit;
}
@@ -170,14 +173,14 @@ int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
VP8PutValue(bw, 0, 9 - bw->nb_bits_);
bw->nb_bits_ = 0; // pad with zeroes
- kFlush(bw);
+ Flush(bw);
return bw->buf_;
}
int VP8BitWriterAppend(VP8BitWriter* const bw,
const uint8_t* data, size_t size) {
- assert(data);
- if (bw->nb_bits_ != -8) return 0; // kFlush() must have been called
+ assert(data != NULL);
+ if (bw->nb_bits_ != -8) return 0; // Flush() must have been called
if (!BitWriterResize(bw, size)) return 0;
memcpy(bw->buf_ + bw->pos_, data, size);
bw->pos_ += size;
@@ -185,8 +188,8 @@ int VP8BitWriterAppend(VP8BitWriter* const bw,
}
void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
- if (bw) {
- free(bw->buf_);
+ if (bw != NULL) {
+ WebPSafeFree(bw->buf_);
memset(bw, 0, sizeof(*bw));
}
}
@@ -194,32 +197,43 @@ void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
//------------------------------------------------------------------------------
// VP8LBitWriter
+// This is the minimum amount by which the memory buffer is guaranteed to grow
+// when extra space is needed.
+#define MIN_EXTRA_SIZE (32768ULL)
+
+#define VP8L_WRITER_BYTES ((int)sizeof(vp8l_wtype_t))
+#define VP8L_WRITER_BITS (VP8L_WRITER_BYTES * 8)
+#define VP8L_WRITER_MAX_BITS (8 * (int)sizeof(vp8l_atype_t))
+
// Returns 1 on success.
static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
uint8_t* allocated_buf;
size_t allocated_size;
- const size_t current_size = VP8LBitWriterNumBytes(bw);
+ const size_t max_bytes = bw->end_ - bw->buf_;
+ const size_t current_size = bw->cur_ - bw->buf_;
const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
const size_t size_required = (size_t)size_required_64b;
if (size_required != size_required_64b) {
bw->error_ = 1;
return 0;
}
- if (bw->max_bytes_ > 0 && size_required <= bw->max_bytes_) return 1;
- allocated_size = (3 * bw->max_bytes_) >> 1;
+ if (max_bytes > 0 && size_required <= max_bytes) return 1;
+ allocated_size = (3 * max_bytes) >> 1;
if (allocated_size < size_required) allocated_size = size_required;
// make allocated size multiple of 1k
allocated_size = (((allocated_size >> 10) + 1) << 10);
- allocated_buf = (uint8_t*)malloc(allocated_size);
+ allocated_buf = (uint8_t*)WebPSafeMalloc(1ULL, allocated_size);
if (allocated_buf == NULL) {
bw->error_ = 1;
return 0;
}
- memcpy(allocated_buf, bw->buf_, current_size);
- free(bw->buf_);
+ if (current_size > 0) {
+ memcpy(allocated_buf, bw->buf_, current_size);
+ }
+ WebPSafeFree(bw->buf_);
bw->buf_ = allocated_buf;
- bw->max_bytes_ = allocated_size;
- memset(allocated_buf + current_size, 0, allocated_size - current_size);
+ bw->cur_ = bw->buf_ + current_size;
+ bw->end_ = bw->buf_ + allocated_size;
return 1;
}
@@ -230,53 +244,64 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
void VP8LBitWriterDestroy(VP8LBitWriter* const bw) {
if (bw != NULL) {
- free(bw->buf_);
+ WebPSafeFree(bw->buf_);
memset(bw, 0, sizeof(*bw));
}
}
void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
- if (n_bits < 1) return;
-#if !defined(__BIG_ENDIAN__)
- // Technically, this branch of the code can write up to 25 bits at a time,
- // but in prefix encoding, the maximum number of bits written is 18 at a time.
- {
- uint8_t* const p = &bw->buf_[bw->bit_pos_ >> 3];
- uint32_t v = *(const uint32_t*)p;
- v |= bits << (bw->bit_pos_ & 7);
- *(uint32_t*)p = v;
- bw->bit_pos_ += n_bits;
- }
-#else // BIG_ENDIAN
- {
- uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
- const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
- const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
- // implicit & 0xff is assumed for uint8_t arithmetic
- *p++ |= bits << bits_reserved_in_first_byte;
- bits >>= 8 - bits_reserved_in_first_byte;
- if (bits_left_to_write >= 1) {
- *p++ = bits;
- bits >>= 8;
- if (bits_left_to_write >= 9) {
- *p++ = bits;
- bits >>= 8;
+ assert(n_bits <= 32);
+ // That's the max we can handle:
+ assert(bw->used_ + n_bits <= 2 * VP8L_WRITER_MAX_BITS);
+ if (n_bits > 0) {
+ // Local field copy.
+ vp8l_atype_t lbits = bw->bits_;
+ int used = bw->used_;
+ // Special case of overflow handling for 32bit accumulator (2-step flush).
+ if (VP8L_WRITER_BITS == 16) {
+ if (used + n_bits >= VP8L_WRITER_MAX_BITS) {
+ // Fill up all the VP8L_WRITER_MAX_BITS so it can be flushed out below.
+ const int shift = VP8L_WRITER_MAX_BITS - used;
+ lbits |= (vp8l_atype_t)bits << used;
+ used = VP8L_WRITER_MAX_BITS;
+ n_bits -= shift;
+ bits >>= shift;
+ assert(n_bits <= VP8L_WRITER_MAX_BITS);
}
}
- assert(n_bits <= 25);
- *p = bits;
- bw->bit_pos_ += n_bits;
+ // If needed, make some room by flushing some bits out.
+ while (used >= VP8L_WRITER_BITS) {
+ if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+ const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+ if (extra_size != (size_t)extra_size ||
+ !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+ bw->cur_ = bw->buf_;
+ bw->error_ = 1;
+ return;
+ }
+ }
+ *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)lbits);
+ bw->cur_ += VP8L_WRITER_BYTES;
+ lbits >>= VP8L_WRITER_BITS;
+ used -= VP8L_WRITER_BITS;
+ }
+ // Finally, insert the new bits.
+ bw->bits_ = lbits | ((vp8l_atype_t)bits << used);
+ bw->used_ = used + n_bits;
}
-#endif
- if ((bw->bit_pos_ >> 3) > (bw->max_bytes_ - 8)) {
- const uint64_t extra_size = 32768ULL + bw->max_bytes_;
- if (extra_size != (size_t)extra_size ||
- !VP8LBitWriterResize(bw, (size_t)extra_size)) {
- bw->bit_pos_ = 0;
- bw->error_ = 1;
+}
+
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+ // flush leftover bits
+ if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
+ while (bw->used_ > 0) {
+ *bw->cur_++ = (uint8_t)bw->bits_;
+ bw->bits_ >>= 8;
+ bw->used_ -= 8;
}
+ bw->used_ = 0;
}
+ return bw->buf_;
}
//------------------------------------------------------------------------------
-
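
The rewritten VP8LWriteBits() keeps an LSB-first accumulator and drains whole
vp8l_wtype_t words at a time. Reduced to a byte-wide drain with no
reallocation (and n_bits at most 24, so the 32-bit shift stays in range), the
invariant looks like this sketch:

    #include <stdint.h>

    typedef struct {
      uint32_t acc;   // bit accumulator, LSB-first
      int used;       // valid bits in 'acc'; stays below 8 between calls
      uint8_t* cur;   // output position (buffer assumed large enough)
    } MiniLWriter;

    static void MiniWriteBits(MiniLWriter* const bw, int n_bits,
                              uint32_t bits) {
      bw->acc |= bits << bw->used;  // append above the pending bits
      bw->used += n_bits;
      while (bw->used >= 8) {       // drain whole bytes, little-endian
        *bw->cur++ = (uint8_t)bw->acc;
        bw->acc >>= 8;
        bw->used -= 8;
      }
    }
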
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.h b/src/3rdparty/libwebp/src/utils/bit_writer.h
index 89a9ead..c80d22a 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.h
+++ b/src/3rdparty/libwebp/src/utils/bit_writer.h
@@ -68,51 +68,46 @@ static WEBP_INLINE size_t VP8BitWriterSize(const VP8BitWriter* const bw) {
//------------------------------------------------------------------------------
// VP8LBitWriter
-// TODO(vikasa): VP8LBitWriter is copied as-is from lossless code. There's scope
-// of re-using VP8BitWriter. Will evaluate once basic lossless encoder is
-// implemented.
-typedef struct {
- uint8_t* buf_;
- size_t bit_pos_;
- size_t max_bytes_;
+#if defined(__x86_64__) || defined(_M_X64) // 64bit
+typedef uint64_t vp8l_atype_t; // accumulator type
+typedef uint32_t vp8l_wtype_t; // writing type
+#define WSWAP HToLE32
+#else
+typedef uint32_t vp8l_atype_t;
+typedef uint16_t vp8l_wtype_t;
+#define WSWAP HToLE16
+#endif
- // After all bits are written, the caller must observe the state of
- // error_. A value of 1 indicates that a memory allocation failure
- // has happened during bit writing. A value of 0 indicates successful
+typedef struct {
+ vp8l_atype_t bits_; // bit accumulator
+ int used_; // number of bits used in accumulator
+ uint8_t* buf_; // start of buffer
+ uint8_t* cur_; // current write position
+ uint8_t* end_; // end of buffer
+
+ // After all bits are written (VP8LBitWriterFinish()), the caller must observe
+ // the state of error_. A value of 1 indicates that a memory allocation
+ // failure has happened during bit writing. A value of 0 indicates successful
// writing of bits.
int error_;
} VP8LBitWriter;
static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
- return (bw->bit_pos_ + 7) >> 3;
+ return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
}
-static WEBP_INLINE uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
- return bw->buf_;
-}
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);
// Returns 0 in case of memory allocation error.
int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
void VP8LBitWriterDestroy(VP8LBitWriter* const bw);
-// This function writes bits into bytes in increasing addresses, and within
-// a byte least-significant-bit first.
-//
-// The function can write up to 16 bits in one go with WriteBits
-// Example: let's assume that 3 bits (Rs below) have been written already:
-//
-// BYTE-0 BYTE+1 BYTE+2
-//
-// 0000 0RRR 0000 0000 0000 0000
-//
-// Now, we could write 5 or fewer bits in MSB by just shifting by 3
-// and OR'ing to BYTE-0.
-//
-// For n bits, we take the last 5 bytes, OR that with high bits in BYTE-0,
-// and locate the rest in BYTE+1 and BYTE+2.
-//
+// This function writes bits into bytes in increasing addresses (little endian),
+// and within a byte least-significant-bit first.
+// This function can write up to 32 bits in one go, but VP8LBitReader can only
+// read 24 bits max (VP8L_MAX_NUM_BIT_READ).
// VP8LBitWriter's error_ flag is set in case of memory allocation error.
void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits);
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.c b/src/3rdparty/libwebp/src/utils/color_cache.c
index 66a4464..8a88f08 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.c
+++ b/src/3rdparty/libwebp/src/utils/color_cache.c
@@ -32,7 +32,7 @@ int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
void VP8LColorCacheClear(VP8LColorCache* const cc) {
if (cc != NULL) {
- free(cc->colors_);
+ WebPSafeFree(cc->colors_);
cc->colors_ = NULL;
}
}
diff --git a/src/3rdparty/libwebp/src/utils/endian_inl.h b/src/3rdparty/libwebp/src/utils/endian_inl.h
new file mode 100644
index 0000000..cd56c37
--- /dev/null
+++ b/src/3rdparty/libwebp/src/utils/endian_inl.h
@@ -0,0 +1,100 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef WEBP_UTILS_ENDIAN_INL_H_
+#define WEBP_UTILS_ENDIAN_INL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../dsp/dsp.h"
+#include "../webp/types.h"
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#endif
+
+#if !defined(HAVE_CONFIG_H)
+// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
+#if LOCAL_GCC_PREREQ(4,3) || LOCAL_CLANG_PREREQ(3,3)
+#define HAVE_BUILTIN_BSWAP32
+#define HAVE_BUILTIN_BSWAP64
+#endif
+// clang-3.3 and gcc-4.8 have a builtin function for swap16
+#if LOCAL_GCC_PREREQ(4,8) || LOCAL_CLANG_PREREQ(3,3)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+#endif // !HAVE_CONFIG_H
+
+static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+ return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(x);
+#else
+ // gcc will recognize a 'rorw $8, ...' here:
+ return (x >> 8) | ((x & 0xff) << 8);
+#endif // HAVE_BUILTIN_BSWAP16
+}
+
+static WEBP_INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(WEBP_USE_MIPS32_R2)
+ uint32_t ret;
+ __asm__ volatile (
+ "wsbh %[ret], %[x] \n\t"
+ "rotr %[ret], %[ret], 16 \n\t"
+ : [ret]"=r"(ret)
+ : [x]"r"(x)
+ );
+ return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+ return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t swapped_bytes;
+ __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint32_t)_byteswap_ulong(x);
+#else
+ return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif // HAVE_BUILTIN_BSWAP32
+}
+
+static WEBP_INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+ return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+ uint64_t swapped_bytes;
+ __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint64_t)_byteswap_uint64(x);
+#else // generic code for swapping 64-bit values (suggested by bdb@)
+ x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+ x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+ x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
+ return x;
+#endif // HAVE_BUILTIN_BSWAP64
+}
+
+#endif // WEBP_UTILS_ENDIAN_INL_H_
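
The generic BSwap32() fallback in the new header is easy to sanity-check: the
swap reverses byte order and is its own inverse.

    #include <assert.h>
    #include <stdint.h>

    static uint32_t BSwap32Generic(uint32_t x) {
      return (x >> 24) | ((x >> 8) & 0xff00u) |
             ((x << 8) & 0xff0000u) | (x << 24);
    }

    int main(void) {
      assert(BSwap32Generic(0x11223344u) == 0x44332211u);
      assert(BSwap32Generic(BSwap32Generic(0xdeadbeefu)) == 0xdeadbeefu);
      return 0;
    }
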
diff --git a/src/3rdparty/libwebp/src/utils/huffman.c b/src/3rdparty/libwebp/src/utils/huffman.c
index 8c5739f..c4c16d9 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.c
+++ b/src/3rdparty/libwebp/src/utils/huffman.c
@@ -22,6 +22,9 @@
// (might be faster on some platform)
// #define USE_LUT_REVERSE_BITS
+// Huffman data read via DecodeImageStream is represented in two (red and green)
+// bytes.
+#define MAX_HTREE_GROUPS 0x10000
#define NON_EXISTENT_SYMBOL (-1)
static void TreeNodeInit(HuffmanTreeNode* const node) {
@@ -46,17 +49,25 @@ static void AssignChildren(HuffmanTree* const tree,
TreeNodeInit(children + 1);
}
+// A Huffman tree is a full binary tree; and in a full binary tree with L
+// leaves, the total number of nodes N = 2 * L - 1.
+static int HuffmanTreeMaxNodes(int num_leaves) {
+ return (2 * num_leaves - 1);
+}
+
+static int HuffmanTreeAllocate(HuffmanTree* const tree, int num_nodes) {
+ assert(tree != NULL);
+ tree->root_ =
+ (HuffmanTreeNode*)WebPSafeMalloc(num_nodes, sizeof(*tree->root_));
+ return (tree->root_ != NULL);
+}
+
static int TreeInit(HuffmanTree* const tree, int num_leaves) {
assert(tree != NULL);
if (num_leaves == 0) return 0;
- // We allocate maximum possible nodes in the tree at once.
- // Note that a Huffman tree is a full binary tree; and in a full binary tree
- // with L leaves, the total number of nodes N = 2 * L - 1.
- tree->max_nodes_ = 2 * num_leaves - 1;
+ tree->max_nodes_ = HuffmanTreeMaxNodes(num_leaves);
assert(tree->max_nodes_ < (1 << 16)); // limit for the lut_jump_ table
- tree->root_ = (HuffmanTreeNode*)WebPSafeMalloc((uint64_t)tree->max_nodes_,
- sizeof(*tree->root_));
- if (tree->root_ == NULL) return 0;
+ if (!HuffmanTreeAllocate(tree, tree->max_nodes_)) return 0;
TreeNodeInit(tree->root_); // Initialize root.
tree->num_nodes_ = 1;
memset(tree->lut_bits_, 255, sizeof(tree->lut_bits_));
@@ -64,17 +75,41 @@ static int TreeInit(HuffmanTree* const tree, int num_leaves) {
return 1;
}
-void HuffmanTreeRelease(HuffmanTree* const tree) {
+void VP8LHuffmanTreeFree(HuffmanTree* const tree) {
if (tree != NULL) {
- free(tree->root_);
+ WebPSafeFree(tree->root_);
tree->root_ = NULL;
tree->max_nodes_ = 0;
tree->num_nodes_ = 0;
}
}
-int HuffmanCodeLengthsToCodes(const int* const code_lengths,
- int code_lengths_size, int* const huff_codes) {
+HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups) {
+ HTreeGroup* const htree_groups =
+ (HTreeGroup*)WebPSafeCalloc(num_htree_groups, sizeof(*htree_groups));
+ assert(num_htree_groups <= MAX_HTREE_GROUPS);
+ if (htree_groups == NULL) {
+ return NULL;
+ }
+ return htree_groups;
+}
+
+void VP8LHtreeGroupsFree(HTreeGroup* htree_groups, int num_htree_groups) {
+ if (htree_groups != NULL) {
+ int i, j;
+ for (i = 0; i < num_htree_groups; ++i) {
+ HuffmanTree* const htrees = htree_groups[i].htrees_;
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+ VP8LHuffmanTreeFree(&htrees[j]);
+ }
+ }
+ WebPSafeFree(htree_groups);
+ }
+}
+
+int VP8LHuffmanCodeLengthsToCodes(
+ const int* const code_lengths, int code_lengths_size,
+ int* const huff_codes) {
int symbol;
int code_len;
int code_length_hist[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
@@ -193,9 +228,10 @@ static int TreeAddSymbol(HuffmanTree* const tree,
return 1;
}
-int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
- const int* const code_lengths,
- int code_lengths_size) {
+int VP8LHuffmanTreeBuildImplicit(HuffmanTree* const tree,
+ const int* const code_lengths,
+ int* const codes,
+ int code_lengths_size) {
int symbol;
int num_symbols = 0;
int root_symbol = 0;
@@ -219,47 +255,43 @@ int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
if (num_symbols == 1) { // Trivial case.
const int max_symbol = code_lengths_size;
if (root_symbol < 0 || root_symbol >= max_symbol) {
- HuffmanTreeRelease(tree);
+ VP8LHuffmanTreeFree(tree);
return 0;
}
return TreeAddSymbol(tree, root_symbol, 0, 0);
} else { // Normal case.
int ok = 0;
+ memset(codes, 0, code_lengths_size * sizeof(*codes));
- // Get Huffman codes from the code lengths.
- int* const codes =
- (int*)WebPSafeMalloc((uint64_t)code_lengths_size, sizeof(*codes));
- if (codes == NULL) goto End;
-
- if (!HuffmanCodeLengthsToCodes(code_lengths, code_lengths_size, codes)) {
+ if (!VP8LHuffmanCodeLengthsToCodes(code_lengths, code_lengths_size,
+ codes)) {
goto End;
}
// Add symbols one-by-one.
for (symbol = 0; symbol < code_lengths_size; ++symbol) {
if (code_lengths[symbol] > 0) {
- if (!TreeAddSymbol(tree, symbol, codes[symbol], code_lengths[symbol])) {
+ if (!TreeAddSymbol(tree, symbol, codes[symbol],
+ code_lengths[symbol])) {
goto End;
}
}
}
ok = 1;
End:
- free(codes);
ok = ok && IsFull(tree);
- if (!ok) HuffmanTreeRelease(tree);
+ if (!ok) VP8LHuffmanTreeFree(tree);
return ok;
}
}
-int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
- const int* const code_lengths,
- const int* const codes,
- const int* const symbols, int max_symbol,
- int num_symbols) {
+int VP8LHuffmanTreeBuildExplicit(HuffmanTree* const tree,
+ const int* const code_lengths,
+ const int* const codes,
+ const int* const symbols, int max_symbol,
+ int num_symbols) {
int ok = 0;
int i;
-
assert(tree != NULL);
assert(code_lengths != NULL);
assert(codes != NULL);
@@ -282,7 +314,6 @@ int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
ok = 1;
End:
ok = ok && IsFull(tree);
- if (!ok) HuffmanTreeRelease(tree);
+ if (!ok) VP8LHuffmanTreeFree(tree);
return ok;
}
-
diff --git a/src/3rdparty/libwebp/src/utils/huffman.h b/src/3rdparty/libwebp/src/utils/huffman.h
index e8afd27..624bc17 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.h
+++ b/src/3rdparty/libwebp/src/utils/huffman.h
@@ -15,6 +15,7 @@
#define WEBP_UTILS_HUFFMAN_H_
#include <assert.h>
+#include "../webp/format_constants.h"
#include "../webp/types.h"
#ifdef __cplusplus
@@ -42,6 +43,12 @@ struct HuffmanTree {
int num_nodes_; // number of currently occupied nodes
};
+// Huffman Tree group.
+typedef struct HTreeGroup HTreeGroup;
+struct HTreeGroup {
+ HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE];
+};
+
// Returns true if the given node is not a leaf of the Huffman tree.
static WEBP_INLINE int HuffmanTreeNodeIsNotLeaf(
const HuffmanTreeNode* const node) {
@@ -56,29 +63,37 @@ static WEBP_INLINE const HuffmanTreeNode* HuffmanTreeNextNode(
// Releases the nodes of the Huffman tree.
// Note: It does NOT free 'tree' itself.
-void HuffmanTreeRelease(HuffmanTree* const tree);
+void VP8LHuffmanTreeFree(HuffmanTree* const tree);
+
+// Creates an instance of HTreeGroup with the specified number of tree groups.
+HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups);
+
+// Releases the memory allocated for HTreeGroup.
+void VP8LHtreeGroupsFree(HTreeGroup* htree_groups, int num_htree_groups);
// Builds Huffman tree assuming code lengths are implicitly in symbol order.
+// The 'huff_codes' and 'code_lengths' are pre-allocated temporary memory
+// buffers, used for creating the Huffman tree.
// Returns false in case of error (invalid tree or memory error).
-int HuffmanTreeBuildImplicit(HuffmanTree* const tree,
- const int* const code_lengths,
- int code_lengths_size);
+int VP8LHuffmanTreeBuildImplicit(HuffmanTree* const tree,
+ const int* const code_lengths,
+ int* const huff_codes,
+ int code_lengths_size);
// Build a Huffman tree with explicitly given lists of code lengths, codes
// and symbols. Verifies that all symbols added are smaller than max_symbol.
// Returns false in case of an invalid symbol, invalid tree or memory error.
-int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
- const int* const code_lengths,
- const int* const codes,
- const int* const symbols, int max_symbol,
- int num_symbols);
+int VP8LHuffmanTreeBuildExplicit(HuffmanTree* const tree,
+ const int* const code_lengths,
+ const int* const codes,
+ const int* const symbols, int max_symbol,
+ int num_symbols);
// Utility: converts Huffman code lengths to corresponding Huffman codes.
// 'huff_codes' should be pre-allocated.
// Returns false in case of error (memory allocation, invalid codes).
-int HuffmanCodeLengthsToCodes(const int* const code_lengths,
- int code_lengths_size, int* const huff_codes);
-
+int VP8LHuffmanCodeLengthsToCodes(const int* const code_lengths,
+ int code_lengths_size, int* const huff_codes);
#ifdef __cplusplus
} // extern "C"
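As a side note, the canonical code assignment that VP8LHuffmanCodeLengthsToCodes performs follows the standard DEFLATE-style construction. A minimal standalone sketch of that algorithm (illustrative names, not the library's internals):

    /* Sketch: derive canonical Huffman codes from code lengths (max 15). */
    static int LengthsToCodes(const int* lengths, int n, int* codes) {
      int hist[16] = { 0 }, next_code[16];
      int i, len, code = 0;
      for (i = 0; i < n; ++i) {
        if (lengths[i] < 0 || lengths[i] > 15) return 0;  /* invalid input */
        ++hist[lengths[i]];
      }
      hist[0] = 0;                      /* unused symbols carry no code */
      for (len = 1; len <= 15; ++len) { /* first code of each length */
        code = (code + hist[len - 1]) << 1;
        next_code[len] = code;
      }
      for (i = 0; i < n; ++i) {         /* assign codes in symbol order */
        codes[i] = (lengths[i] > 0) ? next_code[lengths[i]]++ : 0;
      }
      return 1;
    }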
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.c b/src/3rdparty/libwebp/src/utils/huffman_encode.c
index 9c59867..6421c2b 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode.c
@@ -28,13 +28,13 @@ static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
// Change the population counts in such a way that the subsequent
// Huffman tree compression, especially its RLE part, gives smaller output.
-static int OptimizeHuffmanForRle(int length, int* const counts) {
- uint8_t* good_for_rle;
+static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle,
+ uint32_t* const counts) {
// 1) Let's make the Huffman code more compatible with rle encoding.
int i;
for (; length >= 0; --length) {
if (length == 0) {
- return 1; // All zeros.
+ return; // All zeros.
}
if (counts[length - 1] != 0) {
// Now counts[0..length - 1] does not have trailing zeros.
@@ -43,15 +43,11 @@ static int OptimizeHuffmanForRle(int length, int* const counts) {
}
// 2) Let's mark all population counts that already can be encoded
// with an rle code.
- good_for_rle = (uint8_t*)calloc(length, 1);
- if (good_for_rle == NULL) {
- return 0;
- }
{
// Let's not spoil any of the existing good rle codes.
// Mark any seq of 0's that is longer than 5 as a good_for_rle.
// Mark any seq of non-0's that is longer than 7 as a good_for_rle.
- int symbol = counts[0];
+ uint32_t symbol = counts[0];
int stride = 0;
for (i = 0; i < length + 1; ++i) {
if (i == length || counts[i] != symbol) {
@@ -73,17 +69,17 @@ static int OptimizeHuffmanForRle(int length, int* const counts) {
}
// 3) Let's replace those population counts that lead to more rle codes.
{
- int stride = 0;
- int limit = counts[0];
- int sum = 0;
+ uint32_t stride = 0;
+ uint32_t limit = counts[0];
+ uint32_t sum = 0;
for (i = 0; i < length + 1; ++i) {
if (i == length || good_for_rle[i] ||
(i != 0 && good_for_rle[i - 1]) ||
!ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
if (stride >= 4 || (stride >= 3 && sum == 0)) {
- int k;
+ uint32_t k;
// The stride must end, collapse what we have, if we have enough (4).
- int count = (sum + stride / 2) / stride;
+ uint32_t count = (sum + stride / 2) / stride;
if (count < 1) {
count = 1;
}
@@ -119,17 +115,8 @@ static int OptimizeHuffmanForRle(int length, int* const counts) {
}
}
}
- free(good_for_rle);
- return 1;
}
-typedef struct {
- int total_count_;
- int value_;
- int pool_index_left_;
- int pool_index_right_;
-} HuffmanTree;
-
// A comparer function for two Huffman trees: sorts first by 'total count'
// (more comes first), and then by 'value' (more comes first).
static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
@@ -175,12 +162,12 @@ static void SetBitDepths(const HuffmanTree* const tree,
// we are not planning to use this with extremely long blocks.
//
// See http://en.wikipedia.org/wiki/Huffman_coding
-static int GenerateOptimalTree(const int* const histogram, int histogram_size,
- int tree_depth_limit,
- uint8_t* const bit_depths) {
- int count_min;
+static void GenerateOptimalTree(const uint32_t* const histogram,
+ int histogram_size,
+ HuffmanTree* tree, int tree_depth_limit,
+ uint8_t* const bit_depths) {
+ uint32_t count_min;
HuffmanTree* tree_pool;
- HuffmanTree* tree;
int tree_size_orig = 0;
int i;
@@ -191,15 +178,9 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
}
if (tree_size_orig == 0) { // pretty optimal already!
- return 1;
+ return;
}
- // 3 * tree_size is enough to cover all the nodes representing a
- // population and all the inserted nodes combining two existing nodes.
- // The tree pool needs 2 * (tree_size_orig - 1) entities, and the
- // tree needs exactly tree_size_orig entities.
- tree = (HuffmanTree*)WebPSafeMalloc(3ULL * tree_size_orig, sizeof(*tree));
- if (tree == NULL) return 0;
tree_pool = tree + tree_size_orig;
// For block sizes with less than 64k symbols we never need to do a
@@ -215,7 +196,7 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
int j;
for (j = 0; j < histogram_size; ++j) {
if (histogram[j] != 0) {
- const int count =
+ const uint32_t count =
(histogram[j] < count_min) ? count_min : histogram[j];
tree[idx].total_count_ = count;
tree[idx].value_ = j;
@@ -231,7 +212,7 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
if (tree_size > 1) { // Normal case.
int tree_pool_size = 0;
while (tree_size > 1) { // Finish when we have only one root.
- int count;
+ uint32_t count;
tree_pool[tree_pool_size++] = tree[tree_size - 1];
tree_pool[tree_pool_size++] = tree[tree_size - 2];
count = tree_pool[tree_pool_size - 1].total_count_ +
@@ -272,8 +253,6 @@ static int GenerateOptimalTree(const int* const histogram, int histogram_size,
}
}
}
- free(tree);
- return 1;
}
// -----------------------------------------------------------------------------
@@ -424,17 +403,15 @@ static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
// -----------------------------------------------------------------------------
// Main entry point
-int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
- HuffmanTreeCode* const tree) {
- const int num_symbols = tree->num_symbols;
- if (!OptimizeHuffmanForRle(num_symbols, histogram)) {
- return 0;
- }
- if (!GenerateOptimalTree(histogram, num_symbols,
- tree_depth_limit, tree->code_lengths)) {
- return 0;
- }
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+ uint8_t* const buf_rle,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeCode* const huff_code) {
+ const int num_symbols = huff_code->num_symbols;
+ memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
+ OptimizeHuffmanForRle(num_symbols, buf_rle, histogram);
+ GenerateOptimalTree(histogram, num_symbols, huff_tree, tree_depth_limit,
+ huff_code->code_lengths);
// Create the actual bit codes for the bit lengths.
- ConvertBitDepthsToSymbols(tree);
- return 1;
+ ConvertBitDepthsToSymbols(huff_code);
}
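With this change the scratch memory moves from the callee to the caller. A hypothetical call site under the new contract (per the removed comment, 3 * num_symbols HuffmanTree entries cover the leaf nodes plus the combining nodes; 'histogram', 'huff_code', 'num_symbols' and 'tree_depth_limit' are assumed set up elsewhere):

    uint8_t* const buf_rle = (uint8_t*)WebPSafeMalloc(1ULL, num_symbols);
    HuffmanTree* const huff_tree =
        (HuffmanTree*)WebPSafeMalloc(3ULL * num_symbols, sizeof(*huff_tree));
    if (buf_rle != NULL && huff_tree != NULL) {
      VP8LCreateHuffmanTree(histogram, tree_depth_limit,
                            buf_rle, huff_tree, &huff_code);
    }
    WebPSafeFree(huff_tree);
    WebPSafeFree(buf_rle);

Hoisting the allocations lets the encoder reuse one scratch block across the many trees it builds, instead of paying a malloc/free pair per tree.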
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.h b/src/3rdparty/libwebp/src/utils/huffman_encode.h
index ee51c68..91aa18f 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode.h
@@ -33,14 +33,26 @@ typedef struct {
uint16_t* codes; // Symbol Codes.
} HuffmanTreeCode;
+// Struct to represent the Huffman tree.
+// TODO(vikasa): Add comments for the fields of the struct.
+typedef struct {
+ uint32_t total_count_;
+ int value_;
+ int pool_index_left_; // Index for the left sub-tree.
+ int pool_index_right_; // Index for the right sub-tree.
+} HuffmanTree;
+
// Turn the Huffman tree into a token sequence.
// Returns the number of tokens used.
int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
HuffmanTreeToken* tokens, int max_tokens);
// Create an optimized tree, and tokenize it.
-int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
- HuffmanTreeCode* const tree);
+// 'buf_rle' and 'huff_tree' are pre-allocated and 'tree' is the constructed
+// Huffman code tree.
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+ uint8_t* const buf_rle, HuffmanTree* const huff_tree,
+ HuffmanTreeCode* const tree);
#ifdef __cplusplus
}
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c b/src/3rdparty/libwebp/src/utils/quant_levels_dec.c
index 8489705..5b8b8b4 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec.c
@@ -7,18 +7,273 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// TODO(skal): implement gradient smoothing.
+// Implement gradient smoothing: we replace a current alpha value by its
+// surrounding average if it's close enough (that is: the change will be less
+// than the minimum distance between two quantized levels).
+// We use a sliding window for computing the 2D moving average.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./quant_levels_dec.h"
-int DequantizeLevels(uint8_t* const data, int width, int height,
- int row, int num_rows) {
- if (data == NULL || width <= 0 || height <= 0 || row < 0 || num_rows < 0 ||
- row + num_rows > height) {
- return 0;
+#include <string.h> // for memset
+
+#include "./utils.h"
+
+// #define USE_DITHERING // uncomment to enable ordered dithering (not vital)
+
+#define FIX 16 // fix-point precision for averaging
+#define LFIX 2 // extra precision for look-up table
+#define LUT_SIZE ((1 << (8 + LFIX)) - 1) // look-up table size
+
+#if defined(USE_DITHERING)
+
+#define DFIX 4 // extra precision for ordered dithering
+#define DSIZE 4 // dithering size (must be a power of two)
+// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
+ { 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision
+ { 12, 4, 14, 6 },
+ { 3, 11, 1, 9 },
+ { 15, 7, 13, 5 }
+};
+
+#else
+#define DFIX 0
+#endif
+
+typedef struct {
+ int width_, height_; // dimension
+ int row_; // current input row being processed
+ uint8_t* src_; // input pointer
+ uint8_t* dst_; // output pointer
+
+ int radius_; // filter radius (=delay)
+ int scale_; // normalization factor, in FIX bits precision
+
+ void* mem_; // all memory
+
+ // various scratch buffers
+ uint16_t* start_;
+ uint16_t* cur_;
+ uint16_t* end_;
+ uint16_t* top_;
+ uint16_t* average_;
+
+ // input levels distribution
+ int num_levels_; // number of quantized levels
+ int min_, max_; // min and max level values
+ int min_level_dist_; // smallest distance between two consecutive levels
+
+ int16_t* correction_; // size = 1 + 2*LUT_SIZE -> ~4k memory
+} SmoothParams;
+
+//------------------------------------------------------------------------------
+
+#define CLIP_MASK (int)(~0U << (8 + DFIX))
+static WEBP_INLINE uint8_t clip_8b(int v) {
+ return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
+}
+
+// vertical accumulation
+static void VFilter(SmoothParams* const p) {
+ const uint8_t* src = p->src_;
+ const int w = p->width_;
+ uint16_t* const cur = p->cur_;
+ const uint16_t* const top = p->top_;
+ uint16_t* const out = p->end_;
+ uint16_t sum = 0; // all arithmetic is modulo 16bit
+ int x;
+
+ for (x = 0; x < w; ++x) {
+ uint16_t new_value;
+ sum += src[x];
+ new_value = top[x] + sum;
+ out[x] = new_value - cur[x]; // vertical sum of 'r' pixels.
+ cur[x] = new_value;
+ }
+ // move input pointers one row down
+ p->top_ = p->cur_;
+ p->cur_ += w;
+ if (p->cur_ == p->end_) p->cur_ = p->start_; // roll-over
+ // We replicate edges, as it's somewhat easier as a boundary condition.
+ // That's why we don't update the 'src' pointer on top/bottom area:
+ if (p->row_ >= 0 && p->row_ < p->height_ - 1) {
+ p->src_ += p->width_;
+ }
+}
+
+// horizontal accumulation. We use mirror replication of missing pixels, as it's
+// a little easier to implement (surprisingly).
+static void HFilter(SmoothParams* const p) {
+ const uint16_t* const in = p->end_;
+ uint16_t* const out = p->average_;
+ const uint32_t scale = p->scale_;
+ const int w = p->width_;
+ const int r = p->radius_;
+
+ int x;
+ for (x = 0; x <= r; ++x) { // left mirroring
+ const uint16_t delta = in[x + r - 1] + in[r - x];
+ out[x] = (delta * scale) >> FIX;
+ }
+ for (; x < w - r; ++x) { // bulk middle run
+ const uint16_t delta = in[x + r] - in[x - r - 1];
+ out[x] = (delta * scale) >> FIX;
+ }
+ for (; x < w; ++x) { // right mirroring
+ const uint16_t delta =
+ 2 * in[w - 1] - in[2 * w - 2 - r - x] - in[x - r - 1];
+ out[x] = (delta * scale) >> FIX;
+ }
+}
+
+// emit one filtered output row
+static void ApplyFilter(SmoothParams* const p) {
+ const uint16_t* const average = p->average_;
+ const int w = p->width_;
+ const int16_t* const correction = p->correction_;
+#if defined(USE_DITHERING)
+ const uint8_t* const dither = kOrderedDither[p->row_ % DSIZE];
+#endif
+ uint8_t* const dst = p->dst_;
+ int x;
+ for (x = 0; x < w; ++x) {
+ const int v = dst[x];
+ if (v < p->max_ && v > p->min_) {
+ const int c = (v << DFIX) + correction[average[x] - (v << LFIX)];
+#if defined(USE_DITHERING)
+ dst[x] = clip_8b(c + dither[x % DSIZE]);
+#else
+ dst[x] = clip_8b(c);
+#endif
+ }
}
+ p->dst_ += w; // advance output pointer
+}
+
+//------------------------------------------------------------------------------
+// Initialize correction table
+
+static void InitCorrectionLUT(int16_t* const lut, int min_dist) {
+ // The correction curve is:
+ // f(x) = x for x <= threshold2
+ // f(x) = 0 for x >= threshold1
+ // and a linear interpolation for range x=[threshold2, threshold1]
+ // (along with f(-x) = -f(x) symmetry).
+ // Note that: threshold2 = 3/4 * threshold1
+ const int threshold1 = min_dist << LFIX;
+ const int threshold2 = (3 * threshold1) >> 2;
+ const int max_threshold = threshold2 << DFIX;
+ const int delta = threshold1 - threshold2;
+ int i;
+ for (i = 1; i <= LUT_SIZE; ++i) {
+ int c = (i <= threshold2) ? (i << DFIX)
+ : (i < threshold1) ? max_threshold * (threshold1 - i) / delta
+ : 0;
+ c >>= LFIX;
+ lut[+i] = +c;
+ lut[-i] = -c;
+ }
+ lut[0] = 0;
+}
+
+static void CountLevels(const uint8_t* const data, int size,
+ SmoothParams* const p) {
+ int i, last_level;
+ uint8_t used_levels[256] = { 0 };
+ p->min_ = 255;
+ p->max_ = 0;
+ for (i = 0; i < size; ++i) {
+ const int v = data[i];
+ if (v < p->min_) p->min_ = v;
+ if (v > p->max_) p->max_ = v;
+ used_levels[v] = 1;
+ }
+  // Compute the minimum distance between two non-zero levels.
+ p->min_level_dist_ = p->max_ - p->min_;
+ last_level = -1;
+ for (i = 0; i < 256; ++i) {
+ if (used_levels[i]) {
+ ++p->num_levels_;
+ if (last_level >= 0) {
+ const int level_dist = i - last_level;
+ if (level_dist < p->min_level_dist_) {
+ p->min_level_dist_ = level_dist;
+ }
+ }
+ last_level = i;
+ }
+ }
+}
+
+// Initialize all params.
+static int InitParams(uint8_t* const data, int width, int height,
+ int radius, SmoothParams* const p) {
+ const int R = 2 * radius + 1; // total size of the kernel
+
+ const size_t size_scratch_m = (R + 1) * width * sizeof(*p->start_);
+ const size_t size_m = width * sizeof(*p->average_);
+ const size_t size_lut = (1 + 2 * LUT_SIZE) * sizeof(*p->correction_);
+ const size_t total_size = size_scratch_m + size_m + size_lut;
+ uint8_t* mem = (uint8_t*)WebPSafeMalloc(1U, total_size);
+
+ if (mem == NULL) return 0;
+ p->mem_ = (void*)mem;
+
+ p->start_ = (uint16_t*)mem;
+ p->cur_ = p->start_;
+ p->end_ = p->start_ + R * width;
+ p->top_ = p->end_ - width;
+ memset(p->top_, 0, width * sizeof(*p->top_));
+ mem += size_scratch_m;
+
+ p->average_ = (uint16_t*)mem;
+ mem += size_m;
+
+ p->width_ = width;
+ p->height_ = height;
+ p->src_ = data;
+ p->dst_ = data;
+ p->radius_ = radius;
+ p->scale_ = (1 << (FIX + LFIX)) / (R * R); // normalization constant
+ p->row_ = -radius;
+
+ // analyze the input distribution so we can best-fit the threshold
+ CountLevels(data, width * height, p);
+
+ // correction table
+ p->correction_ = ((int16_t*)mem) + LUT_SIZE;
+ InitCorrectionLUT(p->correction_, p->min_level_dist_);
+
return 1;
}
+static void CleanupParams(SmoothParams* const p) {
+ WebPSafeFree(p->mem_);
+}
+
+int WebPDequantizeLevels(uint8_t* const data, int width, int height,
+ int strength) {
+ const int radius = 4 * strength / 100;
+ if (strength < 0 || strength > 100) return 0;
+ if (data == NULL || width <= 0 || height <= 0) return 0; // bad params
+ if (radius > 0) {
+ SmoothParams p;
+ memset(&p, 0, sizeof(p));
+ if (!InitParams(data, width, height, radius, &p)) return 0;
+ if (p.num_levels_ > 2) {
+ for (; p.row_ < p.height_; ++p.row_) {
+ VFilter(&p); // accumulate average of input
+        // We need to wait a few rows in order to prime the filter
+        // before emitting some output.
+ if (p.row_ >= p.radius_) {
+ HFilter(&p);
+ ApplyFilter(&p);
+ }
+ }
+ }
+ CleanupParams(&p);
+ }
+ return 1;
+}
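For scale: a strength of 50 maps to a radius of 4 * 50 / 100 = 2, i.e. a 5x5 averaging window, and with min_level_dist_ = 8 the correction LUT ramps down between threshold2 = 24 and threshold1 = 32 (in LFIX precision). A hedged usage sketch for the new entry point ('alpha' is assumed to point to width * height caller-owned bytes):

    /* Smooth a quantized 8-bit alpha plane in place. */
    if (!WebPDequantizeLevels(alpha, width, height, 50 /* strength */)) {
      /* NULL data, bad dimensions, or allocation failure */
    }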
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h b/src/3rdparty/libwebp/src/utils/quant_levels_dec.h
index 0288383..9aab068 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec.h
@@ -21,11 +21,12 @@ extern "C" {
#endif
// Apply post-processing to input 'data' of size 'width'x'height' assuming that
-// the source was quantized to a reduced number of levels. The post-processing
-// will be applied to 'num_rows' rows of 'data' starting from 'row'.
-// Returns false in case of error (data is NULL, invalid parameters, ...).
-int DequantizeLevels(uint8_t* const data, int width, int height,
- int row, int num_rows);
+// the source was quantized to a reduced number of levels.
+// Strength is in [0..100] and controls the amount of dithering applied.
+// Returns false in case of error (data is NULL, invalid parameters,
+// malloc failure, ...).
+int WebPDequantizeLevels(uint8_t* const data, int width, int height,
+ int strength);
#ifdef __cplusplus
} // extern "C"
diff --git a/src/3rdparty/libwebp/src/utils/random.h b/src/3rdparty/libwebp/src/utils/random.h
index 08a83e9..c392a61 100644
--- a/src/3rdparty/libwebp/src/utils/random.h
+++ b/src/3rdparty/libwebp/src/utils/random.h
@@ -45,7 +45,8 @@ static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
rg->tab_[rg->index1_] = diff;
if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
- diff = (diff << 1) >> (32 - num_bits); // sign-extend, 0-center
+ // sign-extend, 0-center
+ diff = (int)((uint32_t)diff << 1) >> (32 - num_bits);
diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX; // restrict range
diff += 1 << (num_bits - 1); // shift back to 0.5-center
return diff;
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.c b/src/3rdparty/libwebp/src/utils/rescaler.c
index 7061246..fad9c6b 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.c
+++ b/src/3rdparty/libwebp/src/utils/rescaler.c
@@ -14,41 +14,20 @@
#include <assert.h>
#include <stdlib.h>
#include "./rescaler.h"
+#include "../dsp/dsp.h"
//------------------------------------------------------------------------------
+// Implementations of critical functions ImportRow / ExportRow
+
+void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
+ const uint8_t* const src, int channel) = NULL;
+void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out) = NULL;
#define RFIX 30
#define MULT_FIX(x, y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
- uint8_t* const dst, int dst_width, int dst_height,
- int dst_stride, int num_channels, int x_add, int x_sub,
- int y_add, int y_sub, int32_t* const work) {
- wrk->x_expand = (src_width < dst_width);
- wrk->src_width = src_width;
- wrk->src_height = src_height;
- wrk->dst_width = dst_width;
- wrk->dst_height = dst_height;
- wrk->dst = dst;
- wrk->dst_stride = dst_stride;
- wrk->num_channels = num_channels;
- // for 'x_expand', we use bilinear interpolation
- wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
- wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
- wrk->y_accum = y_add;
- wrk->y_add = y_add;
- wrk->y_sub = y_sub;
- wrk->fx_scale = (1 << RFIX) / x_sub;
- wrk->fy_scale = (1 << RFIX) / y_sub;
- wrk->fxy_scale = wrk->x_expand ?
- ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
- ((int64_t)dst_height << RFIX) / (x_add * src_height);
- wrk->irow = work;
- wrk->frow = work + num_channels * dst_width;
-}
-
-void WebPRescalerImportRow(WebPRescaler* const wrk,
- const uint8_t* const src, int channel) {
+static void ImportRowC(WebPRescaler* const wrk,
+ const uint8_t* const src, int channel) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int x_in = channel;
@@ -84,22 +63,20 @@ void WebPRescalerImportRow(WebPRescaler* const wrk,
accum -= wrk->x_sub;
}
}
- // Accumulate the new row's contribution
+ // Accumulate the contribution of the new row.
for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
wrk->irow[x_out] += wrk->frow[x_out];
}
}
-uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk) {
+static void ExportRowC(WebPRescaler* const wrk, int x_out) {
if (wrk->y_accum <= 0) {
- int x_out;
uint8_t* const dst = wrk->dst;
int32_t* const irow = wrk->irow;
const int32_t* const frow = wrk->frow;
const int yscale = wrk->fy_scale * (-wrk->y_accum);
const int x_out_max = wrk->dst_width * wrk->num_channels;
-
- for (x_out = 0; x_out < x_out_max; ++x_out) {
+ for (; x_out < x_out_max; ++x_out) {
const int frac = (int)MULT_FIX(frow[x_out], yscale);
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
@@ -107,9 +84,214 @@ uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk) {
}
wrk->y_accum += wrk->y_add;
wrk->dst += wrk->dst_stride;
- return dst;
+ }
+}
+
+//------------------------------------------------------------------------------
+// MIPS version
+
+#if defined(WEBP_USE_MIPS32)
+
+static void ImportRowMIPS(WebPRescaler* const wrk,
+ const uint8_t* const src, int channel) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int fx_scale = wrk->fx_scale;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ int* frow = wrk->frow + channel;
+ int* irow = wrk->irow + channel;
+ const uint8_t* src1 = src + channel;
+ int temp1, temp2, temp3;
+ int base, frac, sum;
+ int accum, accum1;
+ const int x_stride1 = x_stride << 2;
+ int loop_c = x_out_max - channel;
+
+ if (!wrk->x_expand) {
+ __asm__ volatile (
+ "li %[temp1], 0x8000 \n\t"
+ "li %[temp2], 0x10000 \n\t"
+ "li %[sum], 0 \n\t"
+ "li %[accum], 0 \n\t"
+ "1: \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "blez %[accum], 3f \n\t"
+ "2: \n\t"
+ "lbu %[temp3], 0(%[src1]) \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "addu %[sum], %[sum], %[temp3] \n\t"
+ "bgtz %[accum], 2b \n\t"
+ "3: \n\t"
+ "lbu %[base], 0(%[src1]) \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "negu %[accum1], %[accum] \n\t"
+ "mul %[frac], %[base], %[accum1] \n\t"
+ "addu %[temp3], %[sum], %[base] \n\t"
+ "mul %[temp3], %[temp3], %[x_sub] \n\t"
+ "lw %[base], 0(%[irow]) \n\t"
+ "subu %[loop_c], %[loop_c], %[x_stride] \n\t"
+ "sll %[accum1], %[frac], 2 \n\t"
+ "mult %[temp1], %[temp2] \n\t"
+ "madd %[accum1], %[fx_scale] \n\t"
+ "mfhi %[sum] \n\t"
+ "subu %[temp3], %[temp3], %[frac] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "add %[base], %[base], %[temp3] \n\t"
+ "sw %[base], 0(%[irow]) \n\t"
+ "addu %[irow], %[irow], %[x_stride1] \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "bgtz %[loop_c], 1b \n\t"
+
+ : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3),
+ [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" (frac),
+ [frow] "+r" (frow), [irow] "+r" (irow), [accum1] "=&r" (accum1),
+ [temp2] "=&r" (temp2), [temp1] "=&r" (temp1)
+ : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale),
+ [x_sub] "r" (x_sub), [x_add] "r" (x_add),
+ [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1)
+ : "memory", "hi", "lo"
+ );
} else {
- return NULL;
+ __asm__ volatile (
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "move %[temp2], %[temp1] \n\t"
+ "li %[accum], 0 \n\t"
+ "1: \n\t"
+ "bgez %[accum], 2f \n\t"
+ "move %[temp2], %[temp1] \n\t"
+ "addu %[src1], %[x_stride] \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "addu %[accum], %[x_add] \n\t"
+ "2: \n\t"
+ "subu %[temp3], %[temp2], %[temp1] \n\t"
+ "mul %[temp3], %[temp3], %[accum] \n\t"
+ "mul %[base], %[temp1], %[x_add] \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "lw %[frac], 0(%[irow]) \n\t"
+ "subu %[loop_c], %[loop_c], %[x_stride] \n\t"
+ "addu %[temp3], %[base], %[temp3] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[x_stride1] \n\t"
+ "addu %[frac], %[temp3] \n\t"
+ "sw %[frac], 0(%[irow]) \n\t"
+ "addu %[irow], %[x_stride1] \n\t"
+ "bgtz %[loop_c], 1b \n\t"
+
+ : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [base] "=&r" (base),
+ [frac] "=&r" (frac), [frow] "+r" (frow), [irow] "+r" (irow)
+ : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
+ [x_stride1] "r" (x_stride1), [loop_c] "r" (loop_c)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+
+static void ExportRowMIPS(WebPRescaler* const wrk, int x_out) {
+ if (wrk->y_accum <= 0) {
+ uint8_t* const dst = wrk->dst;
+ int32_t* const irow = wrk->irow;
+ const int32_t* const frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+    // If wrk->fxy_scale fits into 32 bits, use the optimized code;
+    // otherwise fall back to the plain C code.
+ if ((wrk->fxy_scale >> 32) == 0) {
+ int temp0, temp1, temp3, temp4, temp5, temp6, temp7, loop_end;
+ const int temp2 = (int)(wrk->fxy_scale);
+ const int temp8 = x_out_max << 2;
+ uint8_t* dst_t = (uint8_t*)dst;
+ int32_t* irow_t = (int32_t*)irow;
+ const int32_t* frow_t = (const int32_t*)frow;
+
+ __asm__ volatile(
+ "addiu %[temp6], $zero, -256 \n\t"
+ "addiu %[temp7], $zero, 255 \n\t"
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow_t], %[temp8] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow_t]) \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "addiu %[frow_t], %[frow_t], 4 \n\t"
+ "sll %[temp0], %[temp0], 2 \n\t"
+ "madd %[temp0], %[yscale] \n\t"
+ "mfhi %[temp1] \n\t"
+ "lw %[temp0], 0(%[irow_t]) \n\t"
+ "addiu %[dst_t], %[dst_t], 1 \n\t"
+ "addiu %[irow_t], %[irow_t], 4 \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "sll %[temp0], %[temp0], 2 \n\t"
+ "madd %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw %[temp1], -4(%[irow_t]) \n\t"
+ "and %[temp0], %[temp5], %[temp6] \n\t"
+ "slti %[temp1], %[temp5], 0 \n\t"
+ "beqz %[temp0], 2f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp7], %[temp1] \n\t"
+ "2: \n\t"
+ "sb %[temp5], -1(%[dst_t]) \n\t"
+ "bne %[frow_t], %[loop_end], 1b \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7), [frow_t]"+r"(frow_t), [irow_t]"+r"(irow_t),
+ [dst_t]"+r"(dst_t), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp8]"r"(temp8)
+ : "memory", "hi", "lo"
+ );
+ wrk->y_accum += wrk->y_add;
+ wrk->dst += wrk->dst_stride;
+ } else {
+ ExportRowC(wrk, x_out);
+ }
+ }
+}
+#endif // WEBP_USE_MIPS32
+
+//------------------------------------------------------------------------------
+
+void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
+ uint8_t* const dst, int dst_width, int dst_height,
+ int dst_stride, int num_channels, int x_add, int x_sub,
+ int y_add, int y_sub, int32_t* const work) {
+ wrk->x_expand = (src_width < dst_width);
+ wrk->src_width = src_width;
+ wrk->src_height = src_height;
+ wrk->dst_width = dst_width;
+ wrk->dst_height = dst_height;
+ wrk->dst = dst;
+ wrk->dst_stride = dst_stride;
+ wrk->num_channels = num_channels;
+ // for 'x_expand', we use bilinear interpolation
+ wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+ wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
+ wrk->y_accum = y_add;
+ wrk->y_add = y_add;
+ wrk->y_sub = y_sub;
+ wrk->fx_scale = (1 << RFIX) / x_sub;
+ wrk->fy_scale = (1 << RFIX) / y_sub;
+ wrk->fxy_scale = wrk->x_expand ?
+ ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
+ ((int64_t)dst_height << RFIX) / (x_add * src_height);
+ wrk->irow = work;
+ wrk->frow = work + num_channels * dst_width;
+
+ if (WebPRescalerImportRow == NULL) {
+ WebPRescalerImportRow = ImportRowC;
+ WebPRescalerExportRow = ExportRowC;
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ WebPRescalerImportRow = ImportRowMIPS;
+ WebPRescalerExportRow = ExportRowMIPS;
+ }
+#endif
+ }
}
}
@@ -142,11 +324,10 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
int WebPRescalerExport(WebPRescaler* const rescaler) {
int total_exported = 0;
while (WebPRescalerHasPendingOutput(rescaler)) {
- WebPRescalerExportRow(rescaler);
+ WebPRescalerExportRow(rescaler, 0);
++total_exported;
}
return total_exported;
}
//------------------------------------------------------------------------------
-
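The rescaler now follows the same lazy, CPU-gated function-pointer dispatch used elsewhere in the dsp/ code: the pointers start out NULL, the first WebPRescalerInit() binds the C versions, and a successful VP8GetCPUInfo(kMIPS32) probe upgrades them. A generic sketch of the pattern (names and the feature macro are illustrative):

    typedef void (*RowFunc)(int32_t* row, int len);
    static void RowC(int32_t* row, int len) {
      (void)row; (void)len;  /* portable fallback */
    }
    static RowFunc DoRow = NULL;

    static void DspInitOnce(void) {
      if (DoRow == NULL) {
        DoRow = RowC;              /* always-safe default */
    #if defined(HAVE_FAST_PATH)    /* assumed feature macro */
        if (CpuHasFeature()) DoRow = RowFast;
    #endif
      }
    }

Note the first-use initialization is not thread-safe; callers are expected to trigger it before spawning workers.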
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.h b/src/3rdparty/libwebp/src/utils/rescaler.h
index 68e49ce..a6f3787 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.h
+++ b/src/3rdparty/libwebp/src/utils/rescaler.h
@@ -52,26 +52,24 @@ void WebPRescalerInit(WebPRescaler* const rescaler,
int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
int max_num_lines);
-// Import a row of data and save its contribution in the rescaler.
-// 'channel' denotes the channel number to be imported.
-void WebPRescalerImportRow(WebPRescaler* const rescaler,
- const uint8_t* const src, int channel);
-
// Import multiple rows over all channels, until at least one row is ready to
// be exported. Returns the actual number of lines that were imported.
int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
const uint8_t* src, int src_stride);
+// Import a row of data and save its contribution in the rescaler.
+// 'channel' denotes the channel number to be imported.
+extern void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
+ const uint8_t* const src, int channel);
+// Export one row (starting at x_out position) from rescaler.
+extern void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out);
+
// Returns true if there are pending output rows ready.
static WEBP_INLINE
int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
return (rescaler->y_accum <= 0);
}
-// Export one row from rescaler. Returns the pointer where output was written,
-// or NULL if no row was pending.
-uint8_t* WebPRescalerExportRow(WebPRescaler* const rescaler);
-
// Export as many rows as possible. Return the numbers of rows written.
int WebPRescalerExport(WebPRescaler* const rescaler);
diff --git a/src/3rdparty/libwebp/src/utils/thread.c b/src/3rdparty/libwebp/src/utils/thread.c
index a9e3fae..264210b 100644
--- a/src/3rdparty/libwebp/src/utils/thread.c
+++ b/src/3rdparty/libwebp/src/utils/thread.c
@@ -14,11 +14,35 @@
#include <assert.h>
#include <string.h> // for memset()
#include "./thread.h"
+#include "./utils.h"
#ifdef WEBP_USE_THREAD
#if defined(_WIN32)
+#include <windows.h>
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef struct {
+ HANDLE waiting_sem_;
+ HANDLE received_sem_;
+ HANDLE signal_event_;
+} pthread_cond_t;
+
+#else // !_WIN32
+
+#include <pthread.h>
+
+#endif // _WIN32
+
+struct WebPWorkerImpl {
+ pthread_mutex_t mutex_;
+ pthread_cond_t condition_;
+ pthread_t thread_;
+};
+
+#if defined(_WIN32)
+
//------------------------------------------------------------------------------
// simplistic pthread emulation layer
@@ -129,23 +153,25 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
//------------------------------------------------------------------------------
+static void Execute(WebPWorker* const worker); // Forward declaration.
+
static THREADFN ThreadLoop(void* ptr) {
WebPWorker* const worker = (WebPWorker*)ptr;
int done = 0;
while (!done) {
- pthread_mutex_lock(&worker->mutex_);
+ pthread_mutex_lock(&worker->impl_->mutex_);
while (worker->status_ == OK) { // wait in idling mode
- pthread_cond_wait(&worker->condition_, &worker->mutex_);
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
if (worker->status_ == WORK) {
- WebPWorkerExecute(worker);
+ Execute(worker);
worker->status_ = OK;
} else if (worker->status_ == NOT_OK) { // finish the worker
done = 1;
}
// signal to the main thread that we're done (for Sync())
- pthread_cond_signal(&worker->condition_);
- pthread_mutex_unlock(&worker->mutex_);
+ pthread_cond_signal(&worker->impl_->condition_);
+ pthread_mutex_unlock(&worker->impl_->mutex_);
}
return THREAD_RETURN(NULL); // Thread is finished
}
@@ -153,32 +179,36 @@ static THREADFN ThreadLoop(void* ptr) {
// main thread state control
static void ChangeState(WebPWorker* const worker,
WebPWorkerStatus new_status) {
- // no-op when attempting to change state on a thread that didn't come up
- if (worker->status_ < OK) return;
+ // No-op when attempting to change state on a thread that didn't come up.
+ // Checking status_ without acquiring the lock first would result in a data
+ // race.
+ if (worker->impl_ == NULL) return;
- pthread_mutex_lock(&worker->mutex_);
- // wait for the worker to finish
- while (worker->status_ != OK) {
- pthread_cond_wait(&worker->condition_, &worker->mutex_);
- }
- // assign new status and release the working thread if needed
- if (new_status != OK) {
- worker->status_ = new_status;
- pthread_cond_signal(&worker->condition_);
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ if (worker->status_ >= OK) {
+ // wait for the worker to finish
+ while (worker->status_ != OK) {
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ // assign new status and release the working thread if needed
+ if (new_status != OK) {
+ worker->status_ = new_status;
+ pthread_cond_signal(&worker->impl_->condition_);
+ }
}
- pthread_mutex_unlock(&worker->mutex_);
+ pthread_mutex_unlock(&worker->impl_->mutex_);
}
#endif // WEBP_USE_THREAD
//------------------------------------------------------------------------------
-void WebPWorkerInit(WebPWorker* const worker) {
+static void Init(WebPWorker* const worker) {
memset(worker, 0, sizeof(*worker));
worker->status_ = NOT_OK;
}
-int WebPWorkerSync(WebPWorker* const worker) {
+static int Sync(WebPWorker* const worker) {
#ifdef WEBP_USE_THREAD
ChangeState(worker, OK);
#endif
@@ -186,56 +216,94 @@ int WebPWorkerSync(WebPWorker* const worker) {
return !worker->had_error;
}
-int WebPWorkerReset(WebPWorker* const worker) {
+static int Reset(WebPWorker* const worker) {
int ok = 1;
worker->had_error = 0;
if (worker->status_ < OK) {
#ifdef WEBP_USE_THREAD
- if (pthread_mutex_init(&worker->mutex_, NULL) ||
- pthread_cond_init(&worker->condition_, NULL)) {
+ worker->impl_ = (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(*worker->impl_));
+ if (worker->impl_ == NULL) {
return 0;
}
- pthread_mutex_lock(&worker->mutex_);
- ok = !pthread_create(&worker->thread_, NULL, ThreadLoop, worker);
+ if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+ goto Error;
+ }
+ if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ goto Error;
+ }
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ ok = !pthread_create(&worker->impl_->thread_, NULL, ThreadLoop, worker);
if (ok) worker->status_ = OK;
- pthread_mutex_unlock(&worker->mutex_);
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ if (!ok) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+ WebPSafeFree(worker->impl_);
+ worker->impl_ = NULL;
+ return 0;
+ }
#else
worker->status_ = OK;
#endif
} else if (worker->status_ > OK) {
- ok = WebPWorkerSync(worker);
+ ok = Sync(worker);
}
assert(!ok || (worker->status_ == OK));
return ok;
}
-void WebPWorkerExecute(WebPWorker* const worker) {
+static void Execute(WebPWorker* const worker) {
if (worker->hook != NULL) {
worker->had_error |= !worker->hook(worker->data1, worker->data2);
}
}
-void WebPWorkerLaunch(WebPWorker* const worker) {
+static void Launch(WebPWorker* const worker) {
#ifdef WEBP_USE_THREAD
ChangeState(worker, WORK);
#else
- WebPWorkerExecute(worker);
+ Execute(worker);
#endif
}
-void WebPWorkerEnd(WebPWorker* const worker) {
- if (worker->status_ >= OK) {
+static void End(WebPWorker* const worker) {
#ifdef WEBP_USE_THREAD
+ if (worker->impl_ != NULL) {
ChangeState(worker, NOT_OK);
- pthread_join(worker->thread_, NULL);
- pthread_mutex_destroy(&worker->mutex_);
- pthread_cond_destroy(&worker->condition_);
+ pthread_join(worker->impl_->thread_, NULL);
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ WebPSafeFree(worker->impl_);
+ worker->impl_ = NULL;
+ }
#else
- worker->status_ = NOT_OK;
+ worker->status_ = NOT_OK;
+ assert(worker->impl_ == NULL);
#endif
- }
assert(worker->status_ == NOT_OK);
}
//------------------------------------------------------------------------------
+static WebPWorkerInterface g_worker_interface = {
+ Init, Reset, Sync, Launch, Execute, End
+};
+
+int WebPSetWorkerInterface(const WebPWorkerInterface* const winterface) {
+ if (winterface == NULL ||
+ winterface->Init == NULL || winterface->Reset == NULL ||
+ winterface->Sync == NULL || winterface->Launch == NULL ||
+ winterface->Execute == NULL || winterface->End == NULL) {
+ return 0;
+ }
+ g_worker_interface = *winterface;
+ return 1;
+}
+
+const WebPWorkerInterface* WebPGetWorkerInterface(void) {
+ return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/thread.h b/src/3rdparty/libwebp/src/utils/thread.h
index aef33bd..7bd451b 100644
--- a/src/3rdparty/libwebp/src/utils/thread.h
+++ b/src/3rdparty/libwebp/src/utils/thread.h
@@ -15,33 +15,15 @@
#define WEBP_UTILS_THREAD_H_
#ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "../webp/config.h"
#endif
+#include "../webp/types.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-#ifdef WEBP_USE_THREAD
-
-#if defined(_WIN32)
-
-#include <windows.h>
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-typedef struct {
- HANDLE waiting_sem_;
- HANDLE received_sem_;
- HANDLE signal_event_;
-} pthread_cond_t;
-
-#else
-
-#include <pthread.h>
-
-#endif /* _WIN32 */
-#endif /* WEBP_USE_THREAD */
-
// State of the worker thread object
typedef enum {
NOT_OK = 0, // object is unusable
@@ -53,13 +35,12 @@ typedef enum {
// arguments (data1 and data2), and should return false in case of error.
typedef int (*WebPWorkerHook)(void*, void*);
-// Synchronize object used to launch job in the worker thread
+// Platform-dependent implementation details for the worker.
+typedef struct WebPWorkerImpl WebPWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
typedef struct {
-#ifdef WEBP_USE_THREAD
- pthread_mutex_t mutex_;
- pthread_cond_t condition_;
- pthread_t thread_;
-#endif
+ WebPWorkerImpl* impl_;
WebPWorkerStatus status_;
WebPWorkerHook hook; // hook to call
void* data1; // first argument passed to 'hook'
@@ -67,26 +48,41 @@ typedef struct {
int had_error; // return value of the last call to 'hook'
} WebPWorker;
-// Must be called first, before any other method.
-void WebPWorkerInit(WebPWorker* const worker);
-// Must be called to initialize the object and spawn the thread. Re-entrant.
-// Will potentially launch the thread. Returns false in case of error.
-int WebPWorkerReset(WebPWorker* const worker);
-// Makes sure the previous work is finished. Returns true if worker->had_error
-// was not set and no error condition was triggered by the working thread.
-int WebPWorkerSync(WebPWorker* const worker);
-// Triggers the thread to call hook() with data1 and data2 argument. These
-// hook/data1/data2 can be changed at any time before calling this function,
-// but not be changed afterward until the next call to WebPWorkerSync().
-void WebPWorkerLaunch(WebPWorker* const worker);
-// This function is similar to WebPWorkerLaunch() except that it calls the
-// hook directly instead of using a thread. Convenient to bypass the thread
-// mechanism while still using the WebPWorker structs. WebPWorkerSync() must
-// still be called afterward (for error reporting).
-void WebPWorkerExecute(WebPWorker* const worker);
-// Kill the thread and terminate the object. To use the object again, one
-// must call WebPWorkerReset() again.
-void WebPWorkerEnd(WebPWorker* const worker);
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+ // Must be called first, before any other method.
+ void (*Init)(WebPWorker* const worker);
+ // Must be called to initialize the object and spawn the thread. Re-entrant.
+ // Will potentially launch the thread. Returns false in case of error.
+ int (*Reset)(WebPWorker* const worker);
+ // Makes sure the previous work is finished. Returns true if worker->had_error
+ // was not set and no error condition was triggered by the working thread.
+ int (*Sync)(WebPWorker* const worker);
+ // Triggers the thread to call hook() with data1 and data2 arguments. These
+ // hook/data1/data2 values can be changed at any time before calling this
+ // function, but not be changed afterward until the next call to Sync().
+ void (*Launch)(WebPWorker* const worker);
+ // This function is similar to Launch() except that it calls the
+ // hook directly instead of using a thread. Convenient to bypass the thread
+ // mechanism while still using the WebPWorker structs. Sync() must
+ // still be called afterward (for error reporting).
+ void (*Execute)(WebPWorker* const worker);
+ // Kill the thread and terminate the object. To use the object again, one
+ // must call Reset() again.
+ void (*End)(WebPWorker* const worker);
+} WebPWorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, so
+// it is safe to free the corresponding memory after this call. This function
+// is not thread-safe. Returns false in case of an invalid pointer or methods.
+WEBP_EXTERN(int) WebPSetWorkerInterface(
+ const WebPWorkerInterface* const interface);
+
+// Retrieve the currently set thread worker interface.
+WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void);
//------------------------------------------------------------------------------
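The new pluggable interface lets an embedder replace the threading layer entirely. A hedged sketch of a serial (thread-free) implementation; the 'My*' names are hypothetical and <string.h> is assumed included:

    static void MyInit(WebPWorker* const w) {
      memset(w, 0, sizeof(*w));
      w->status_ = NOT_OK;
    }
    static int  MyReset(WebPWorker* const w) {
      w->had_error = 0; w->status_ = OK; return 1;
    }
    static int  MySync(WebPWorker* const w) { return !w->had_error; }
    static void MyExecute(WebPWorker* const w) {
      if (w->hook != NULL) w->had_error |= !w->hook(w->data1, w->data2);
    }
    static void MyLaunch(WebPWorker* const w) { MyExecute(w); }
    static void MyEnd(WebPWorker* const w)    { w->status_ = NOT_OK; }

    static const WebPWorkerInterface kSerialInterface = {
      MyInit, MyReset, MySync, MyLaunch, MyExecute, MyEnd
    };

    int UseSerialWorkers(void) {   /* call before any encode/decode */
      return WebPSetWorkerInterface(&kSerialInterface);
    }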
diff --git a/src/3rdparty/libwebp/src/utils/utils.c b/src/3rdparty/libwebp/src/utils/utils.c
index 5592538..8ff7f12 100644
--- a/src/3rdparty/libwebp/src/utils/utils.c
+++ b/src/3rdparty/libwebp/src/utils/utils.c
@@ -14,29 +14,198 @@
#include <stdlib.h>
#include "./utils.h"
+// If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
+// alloc/free calls, etc.) is printed. For debugging/tuning purposes only
+// (it's slow, and not multi-thread safe!).
+// An interesting alternative is valgrind's 'massif' tool:
+// http://valgrind.org/docs/manual/ms-manual.html
+// Here is an example command line:
+/* valgrind --tool=massif --massif-out-file=massif.out \
+ --stacks=yes --alloc-fn=WebPSafeAlloc --alloc-fn=WebPSafeCalloc
+ ms_print massif.out
+*/
+// In addition:
+// * if PRINT_MEM_TRAFFIC is defined, all the details of the malloc/free cycles
+// are printed.
+// * if MALLOC_FAIL_AT is defined, the global environment variable
+// $MALLOC_FAIL_AT is used to simulate a memory error when calloc or malloc
+// is called for the nth time. Example usage:
+// export MALLOC_FAIL_AT=50 && ./examples/cwebp input.png
+// * if MALLOC_LIMIT is defined, the global environment variable $MALLOC_LIMIT
+// sets the maximum amount of memory (in bytes) made available to libwebp.
+// This can be used to emulate an environment with very limited memory.
+// Example: export MALLOC_LIMIT=64000000 && ./examples/dwebp picture.webp
+
+// #define PRINT_MEM_INFO
+// #define PRINT_MEM_TRAFFIC
+// #define MALLOC_FAIL_AT
+// #define MALLOC_LIMIT
+
//------------------------------------------------------------------------------
// Checked memory allocation
+#if defined(PRINT_MEM_INFO)
+
+#include <stdio.h>
+#include <stdlib.h> // for abort()
+
+static int num_malloc_calls = 0;
+static int num_calloc_calls = 0;
+static int num_free_calls = 0;
+static int countdown_to_fail = 0; // 0 = off
+
+typedef struct MemBlock MemBlock;
+struct MemBlock {
+ void* ptr_;
+ size_t size_;
+ MemBlock* next_;
+};
+
+static MemBlock* all_blocks = NULL;
+static size_t total_mem = 0;
+static size_t total_mem_allocated = 0;
+static size_t high_water_mark = 0;
+static size_t mem_limit = 0;
+
+static int exit_registered = 0;
+
+static void PrintMemInfo(void) {
+ fprintf(stderr, "\nMEMORY INFO:\n");
+ fprintf(stderr, "num calls to: malloc = %4d\n", num_malloc_calls);
+ fprintf(stderr, " calloc = %4d\n", num_calloc_calls);
+ fprintf(stderr, " free = %4d\n", num_free_calls);
+ fprintf(stderr, "total_mem: %u\n", (uint32_t)total_mem);
+ fprintf(stderr, "total_mem allocated: %u\n", (uint32_t)total_mem_allocated);
+ fprintf(stderr, "high-water mark: %u\n", (uint32_t)high_water_mark);
+ while (all_blocks != NULL) {
+ MemBlock* b = all_blocks;
+ all_blocks = b->next_;
+ free(b);
+ }
+}
+
+static void Increment(int* const v) {
+ if (!exit_registered) {
+#if defined(MALLOC_FAIL_AT)
+ {
+ const char* const malloc_fail_at_str = getenv("MALLOC_FAIL_AT");
+ if (malloc_fail_at_str != NULL) {
+ countdown_to_fail = atoi(malloc_fail_at_str);
+ }
+ }
+#endif
+#if defined(MALLOC_LIMIT)
+ {
+ const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+ if (malloc_limit_str != NULL) {
+ mem_limit = atoi(malloc_limit_str);
+ }
+ }
+#endif
+ (void)countdown_to_fail;
+ (void)mem_limit;
+ atexit(PrintMemInfo);
+ exit_registered = 1;
+ }
+ ++*v;
+}
+
+static void AddMem(void* ptr, size_t size) {
+ if (ptr != NULL) {
+ MemBlock* const b = (MemBlock*)malloc(sizeof(*b));
+ if (b == NULL) abort();
+ b->next_ = all_blocks;
+ all_blocks = b;
+ b->ptr_ = ptr;
+ b->size_ = size;
+ total_mem += size;
+ total_mem_allocated += size;
+#if defined(PRINT_MEM_TRAFFIC)
+#if defined(MALLOC_FAIL_AT)
+ fprintf(stderr, "fail-count: %5d [mem=%u]\n",
+ num_malloc_calls + num_calloc_calls, (uint32_t)total_mem);
+#else
+ fprintf(stderr, "Mem: %u (+%u)\n", (uint32_t)total_mem, (uint32_t)size);
+#endif
+#endif
+ if (total_mem > high_water_mark) high_water_mark = total_mem;
+ }
+}
+
+static void SubMem(void* ptr) {
+ if (ptr != NULL) {
+ MemBlock** b = &all_blocks;
+ // Inefficient search, but that's just for debugging.
+ while (*b != NULL && (*b)->ptr_ != ptr) b = &(*b)->next_;
+ if (*b == NULL) {
+ fprintf(stderr, "Invalid pointer free! (%p)\n", ptr);
+ abort();
+ }
+ {
+ MemBlock* const block = *b;
+ *b = block->next_;
+ total_mem -= block->size_;
+#if defined(PRINT_MEM_TRAFFIC)
+ fprintf(stderr, "Mem: %u (-%u)\n",
+ (uint32_t)total_mem, (uint32_t)block->size_);
+#endif
+ free(block);
+ }
+ }
+}
+
+#else
+#define Increment(v) do {} while (0)
+#define AddMem(p, s) do {} while (0)
+#define SubMem(p) do {} while (0)
+#endif
+
// Returns 0 in case of overflow of nmemb * size.
static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
const uint64_t total_size = nmemb * size;
if (nmemb == 0) return 1;
if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
if (total_size != (size_t)total_size) return 0;
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
+ if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
+ return 0; // fake fail!
+ }
+#endif
+#if defined(MALLOC_LIMIT)
+ if (mem_limit > 0 && total_mem + total_size >= mem_limit) {
+ return 0; // fake fail!
+ }
+#endif
+
return 1;
}
void* WebPSafeMalloc(uint64_t nmemb, size_t size) {
+ void* ptr;
+ Increment(&num_malloc_calls);
if (!CheckSizeArgumentsOverflow(nmemb, size)) return NULL;
assert(nmemb * size > 0);
- return malloc((size_t)(nmemb * size));
+ ptr = malloc((size_t)(nmemb * size));
+ AddMem(ptr, (size_t)(nmemb * size));
+ return ptr;
}
void* WebPSafeCalloc(uint64_t nmemb, size_t size) {
+ void* ptr;
+ Increment(&num_calloc_calls);
if (!CheckSizeArgumentsOverflow(nmemb, size)) return NULL;
assert(nmemb * size > 0);
- return calloc((size_t)nmemb, size);
+ ptr = calloc((size_t)nmemb, size);
+ AddMem(ptr, (size_t)(nmemb * size));
+ return ptr;
}
-//------------------------------------------------------------------------------
+void WebPSafeFree(void* const ptr) {
+ if (ptr != NULL) {
+ Increment(&num_free_calls);
+ SubMem(ptr);
+ }
+ free(ptr);
+}
+//------------------------------------------------------------------------------
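One consequence of the tracking hooks: every allocation made through WebPSafeMalloc()/WebPSafeCalloc() must be released with WebPSafeFree(), since a plain free() would bypass SubMem() and abort the PRINT_MEM_INFO bookkeeping. Minimal sketch ('n' is assumed validated by the caller's context):

    int32_t* const work = (int32_t*)WebPSafeMalloc(n, sizeof(*work));
    if (work == NULL) return 0;  /* overflow check failed, or (simulated) OOM */
    /* ... use the buffer ... */
    WebPSafeFree(work);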
diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h
index 8bdf0f0..f2c498a 100644
--- a/src/3rdparty/libwebp/src/utils/utils.h
+++ b/src/3rdparty/libwebp/src/utils/utils.h
@@ -35,10 +35,13 @@ extern "C" {
// somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this
// safe malloc() borrows the signature from calloc(), pointing at the dangerous
// underlying multiply involved.
-void* WebPSafeMalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN(void*) WebPSafeMalloc(uint64_t nmemb, size_t size);
// Note that WebPSafeCalloc() expects the second argument type to be 'size_t'
// in order to favor the "calloc(num_foo, sizeof(foo))" pattern.
-void* WebPSafeCalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN(void*) WebPSafeCalloc(uint64_t nmemb, size_t size);
+
+// Companion deallocation function to the above allocations.
+WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
//------------------------------------------------------------------------------
// Reading/writing data.
@@ -74,6 +77,41 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
PutLE16(data + 2, (int)(val >> 16));
}
+// Returns (int)floor(log2(n)). n must be > 0.
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+ return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+ (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+ uint32_t first_set_bit;
+ _BitScanReverse(&first_set_bit, n);
+ return first_set_bit;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+ int log = 0;
+ uint32_t value = n;
+ int i;
+
+ for (i = 4; i >= 0; --i) {
+ const int shift = (1 << i);
+ const uint32_t x = value >> shift;
+ if (x != 0) {
+ value = x;
+ log += shift;
+ }
+ }
+ return log;
+}
+#endif
+
//------------------------------------------------------------------------------
#ifdef __cplusplus
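A quick worked check of BitsLog2Floor() against its contract (n > 0), valid for all three implementations:

    assert(BitsLog2Floor(1) == 0);            /* 2^0 <= 1 < 2^1 */
    assert(BitsLog2Floor(1000) == 9);         /* 2^9 = 512 <= 1000 < 1024 */
    assert(BitsLog2Floor(0x80000000u) == 31); /* top bit set */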
diff --git a/src/3rdparty/libwebp/src/webp/decode.h b/src/3rdparty/libwebp/src/webp/decode.h
index 0c3b62e..8d3f7be 100644
--- a/src/3rdparty/libwebp/src/webp/decode.h
+++ b/src/3rdparty/libwebp/src/webp/decode.h
@@ -442,11 +442,23 @@ struct WebPDecoderOptions {
int scaled_width, scaled_height; // final resolution
int use_threads; // if true, use multi-threaded decoding
int dithering_strength; // dithering strength (0=Off, 100=full)
+#if WEBP_DECODER_ABI_VERSION > 0x0203
+ int flip; // flip output vertically
+#endif
+#if WEBP_DECODER_ABI_VERSION > 0x0204
+ int alpha_dithering_strength; // alpha dithering strength in [0..100]
+#endif
// Unused for now:
int force_rotation; // forced rotation (to be applied _last_)
int no_enhancement; // if true, discard enhancement layer
+#if WEBP_DECODER_ABI_VERSION < 0x0203
uint32_t pad[5]; // padding for later use
+#elif WEBP_DECODER_ABI_VERSION < 0x0204
+ uint32_t pad[4]; // padding for later use
+#else
+ uint32_t pad[3]; // padding for later use
+#endif
};
// Main object storing the configuration for advanced decoding.
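Client code that wants to stay source-compatible across these ABI revisions can mirror the header's guards. A hedged sketch using the public config API, inside the caller's decode setup:

    WebPDecoderConfig config;
    if (WebPInitDecoderConfig(&config)) {
    #if WEBP_DECODER_ABI_VERSION > 0x0203
      config.options.flip = 1;                        /* new field */
    #endif
    #if WEBP_DECODER_ABI_VERSION > 0x0204
      config.options.alpha_dithering_strength = 100;  /* new field */
    #endif
    }

The shrinking pad[] array keeps sizeof(WebPDecoderOptions) stable as fields are appended.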
diff --git a/src/3rdparty/libwebp/src/webp/encode.h b/src/3rdparty/libwebp/src/webp/encode.h
index 7a428b4..b3f05b1 100644
--- a/src/3rdparty/libwebp/src/webp/encode.h
+++ b/src/3rdparty/libwebp/src/webp/encode.h
@@ -167,6 +167,16 @@ static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
WEBP_ENCODER_ABI_VERSION);
}
+#if WEBP_ENCODER_ABI_VERSION > 0x0202
+// Activate the lossless compression mode with the desired efficiency level
+// between 0 (fastest, lowest compression) and 9 (slower, best compression).
+// A good default level is '6', providing a fair tradeoff between compression
+// speed and final compressed size.
+// This function will overwrite several fields from config: 'method', 'quality'
+// and 'lossless'. Returns false in case of parameter error.
+WEBP_EXTERN(int) WebPConfigLosslessPreset(WebPConfig* config, int level);
+#endif
+
// Returns true if 'config' is non-NULL and all configuration parameters are
// within their valid ranges.
WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* config);
@@ -221,9 +231,18 @@ struct WebPMemoryWriter {
// The following must be called first before any use.
WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
+// The following must be called to deallocate writer->mem memory. The 'writer'
+// object itself is not deallocated.
+WEBP_EXTERN(void) WebPMemoryWriterClear(WebPMemoryWriter* writer);
+#endif
// The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
// completion, writer.mem and writer.size will hold the coded data.
-// writer.mem must be freed using the call 'free(writer.mem)'.
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
+// writer.mem must be freed by calling WebPMemoryWriterClear.
+#else
+// writer.mem must be freed by calling 'free(writer.mem)'.
+#endif
WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size,
const WebPPicture* picture);
@@ -235,16 +254,9 @@ typedef int (*WebPProgressHook)(int percent, const WebPPicture* picture);
// Color spaces.
typedef enum WebPEncCSP {
// chroma sampling
- WEBP_YUV420 = 0, // 4:2:0
- WEBP_YUV422 = 1, // 4:2:2
- WEBP_YUV444 = 2, // 4:4:4
- WEBP_YUV400 = 3, // grayscale
- WEBP_CSP_UV_MASK = 3, // bit-mask to get the UV sampling factors
- // alpha channel variants
- WEBP_YUV420A = 4,
- WEBP_YUV422A = 5,
- WEBP_YUV444A = 6,
- WEBP_YUV400A = 7, // grayscale + alpha
+ WEBP_YUV420 = 0, // 4:2:0
+ WEBP_YUV420A = 4, // alpha channel variant
+ WEBP_CSP_UV_MASK = 3, // bit-mask to get the UV sampling factors
WEBP_CSP_ALPHA_BIT = 4 // bit that is set if alpha is present
} WebPEncCSP;
@@ -323,17 +335,15 @@ struct WebPPicture {
uint32_t pad3[3]; // padding for later use
- // Unused for now: original samples (for non-YUV420 modes)
- uint8_t *u0, *v0;
- int uv0_stride;
-
- uint32_t pad4[7]; // padding for later use
+ // Unused for now
+ uint8_t *pad4, *pad5;
+ uint32_t pad6[8]; // padding for later use
// PRIVATE FIELDS
////////////////////
void* memory_; // row chunk of memory for yuva planes
void* memory_argb_; // and for argb too.
- void* pad5[2]; // padding for later use
+ void* pad7[2]; // padding for later use
};
// Internal, version-checked, entry point
@@ -409,7 +419,9 @@ WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture);
// Rescale a picture to new dimension width x height.
-// Now gamma correction is applied.
+// If either 'width' or 'height' (but not both) is 0 the corresponding
+// dimension will be calculated preserving the aspect ratio.
+// No gamma correction is applied.
// Returns false in case of error (invalid parameter or insufficient memory).
WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height);
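With the corrected contract, passing 0 for one dimension now has a defined meaning; a sketch assuming an already-imported WebPPicture named 'pic':

    /* Downscale to 640 px wide; height 0 asks the library to derive the
       height from the source aspect ratio. No gamma correction is applied. */
    if (!WebPPictureRescale(&pic, 640, 0)) {
      /* invalid parameter or insufficient memory */
    }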
@@ -436,13 +448,14 @@ WEBP_EXTERN(int) WebPPictureImportBGRA(
WEBP_EXTERN(int) WebPPictureImportBGRX(
WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);
-// Converts picture->argb data to the YUVA format specified by 'colorspace'.
+// Converts picture->argb data to the YUV420A format. The 'colorspace'
+// parameter is deprecated and should be equal to WEBP_YUV420.
// Upon return, picture->use_argb is set to false. The presence of real
// non-opaque transparent values is detected, and 'colorspace' will be
// adjusted accordingly. Note that this method is lossy.
// Returns false in case of error.
WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
- WebPEncCSP colorspace);
+ WebPEncCSP /*colorspace = WEBP_YUV420*/);
// Same as WebPPictureARGBToYUVA(), but the conversion is done using
// pseudo-random dithering with a strength 'dithering' between
@@ -451,6 +464,15 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
WebPPicture* picture, WebPEncCSP colorspace, float dithering);
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+// Performs 'smart' RGBA->YUVA420 downsampling and colorspace conversion.
+// Downsampling is handled with extra care in case of color clipping. This
+// method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
+// YUV representation.
+// Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
+#endif
+
// Converts picture->yuv to picture->argb and sets picture->use_argb to true.
// The input format must be YUV_420 or YUV_420A.
// Note that the use of this method is discouraged if one has access to the
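A sketch of selecting between the two conversions at compile time, assuming a WebPPicture 'pic' that already holds ARGB data (pic.use_argb == 1, e.g. after WebPPictureImportRGBA):

    #if WEBP_ENCODER_ABI_VERSION > 0x0204
    /* Roughly 2x slower than WebPPictureARGBToYUVA() but a better 4:2:0 result. */
    if (!WebPPictureSmartARGBToYUVA(&pic)) return 0;
    #else
    if (!WebPPictureARGBToYUVA(&pic, WEBP_YUV420)) return 0;
    #endif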
@@ -459,9 +481,9 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
// Returns false in case of error.
WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture);
-// Helper function: given a width x height plane of YUV(A) samples
-// (with stride 'stride'), clean-up the YUV samples under fully transparent
-// area, to help compressibility (no guarantee, though).
+// Helper function: given a width x height plane of RGBA or YUV(A) samples,
+// clean up the YUV or RGB samples under fully transparent areas, to help
+// compressibility (no guarantee, though).
WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture);
// Scan the picture 'picture' for the presence of non fully opaque alpha values.
diff --git a/src/3rdparty/libwebp/src/webp/mux.h b/src/3rdparty/libwebp/src/webp/mux.h
index eb57f51..1ae03b3 100644
--- a/src/3rdparty/libwebp/src/webp/mux.h
+++ b/src/3rdparty/libwebp/src/webp/mux.h
@@ -105,6 +105,7 @@ WEBP_EXTERN(WebPMux*) WebPNewInternal(int);
// Creates an empty mux object.
// Returns:
// A pointer to the newly created empty mux object.
+// Or NULL in case of memory error.
static WEBP_INLINE WebPMux* WebPMuxNew(void) {
return WebPNewInternal(WEBP_MUX_ABI_VERSION);
}
@@ -309,6 +310,26 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
//------------------------------------------------------------------------------
// Misc Utilities.
+#if WEBP_MUX_ABI_VERSION > 0x0101
+// Sets the canvas size for the mux object. The width and height can be
+// specified explicitly or left as zero (0, 0).
+// * When width and height are specified explicitly, then this frame bound is
+// enforced during subsequent calls to WebPMuxAssemble() and an error is
+// reported if any animated frame does not completely fit within the canvas.
+// * When unspecified (0, 0), the constructed canvas gets its bounds from the
+//   bounding box over all frames after calling WebPMuxAssemble().
+// Parameters:
+// mux - (in) object to which the canvas size is to be set
+// width - (in) canvas width
+// height - (in) canvas height
+// Returns:
+// WEBP_MUX_INVALID_ARGUMENT - if mux is NULL; or
+// width or height are invalid or out of bounds
+// WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux,
+ int width, int height);
+#endif
+
// Gets the canvas size from the mux object.
// Note: This method assumes that the VP8X chunk, if present, is up-to-date.
// That is, the mux object hasn't been modified since the last call to
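A sketch of the new call in context, assuming an animation being assembled through the mux API declared in this header:

    WebPMux* const mux = WebPMuxNew();
    if (mux == NULL) return 0;  /* memory error, per the note above */
    #if WEBP_MUX_ABI_VERSION > 0x0101
    /* Pin the canvas to 320x240: frames that do not fit will now make
       WebPMuxAssemble() fail instead of silently growing the canvas. */
    if (WebPMuxSetCanvasSize(mux, 320, 240) != WEBP_MUX_OK) {
      WebPMuxDelete(mux);
      return 0;
    }
    #endif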
@@ -356,7 +377,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
// Note: The content of 'assembled_data' will be ignored and overwritten.
// Also, the content of 'assembled_data' is allocated using malloc(), and NOT
// owned by the 'mux' object. It MUST be deallocated by the caller by calling
-// WebPDataClear().
+// WebPDataClear(). It's always safe to call WebPDataClear() upon return,
+// even in case of error.
// Parameters:
// mux - (in/out) object whose chunks are to be assembled
// assembled_data - (out) assembled WebP data
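A sketch of the assemble/cleanup sequence this guarantee enables, using the WebPData helpers from mux_types.h (assumed available alongside this header) and a 'mux' built as above:

    WebPData assembled;
    WebPDataInit(&assembled);
    const WebPMuxError err = WebPMuxAssemble(mux, &assembled);
    if (err == WEBP_MUX_OK) {
      /* ... persist assembled.bytes / assembled.size ... */
    }
    WebPDataClear(&assembled);  /* safe even when err != WEBP_MUX_OK */
    WebPMuxDelete(mux);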
diff --git a/src/3rdparty/libwebp/src/webp/types.h b/src/3rdparty/libwebp/src/webp/types.h
index 568d1f2..9b036e0 100644
--- a/src/3rdparty/libwebp/src/webp/types.h
+++ b/src/3rdparty/libwebp/src/webp/types.h
@@ -18,10 +18,11 @@
#ifndef _MSC_VER
#include <inttypes.h>
-#ifdef __STRICT_ANSI__
-#define WEBP_INLINE
-#else /* __STRICT_ANSI__ */
+#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define WEBP_INLINE inline
+#else
+#define WEBP_INLINE
#endif
#else
typedef signed char int8_t;
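The rewritten guard keeps 'inline' for C++, C99-and-later, and non-strict-ANSI builds, and only drops it under strict C89, where 'inline' is not a keyword. A sketch with a hypothetical helper:

    /* Expands to 'static inline' under C++/C99/gnu89; under e.g. 'gcc -ansi'
       (strict C89) WEBP_INLINE is empty and this is a plain static function. */
    static WEBP_INLINE int Clamp255(int v) {
      return (v < 0) ? 0 : (v > 255) ? 255 : v;
    }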