119 files changed, 5235 insertions, 2900 deletions
diff --git a/src/3rdparty/libwebp/AUTHORS b/src/3rdparty/libwebp/AUTHORS
index 67482c1..2f0c537 100644
--- a/src/3rdparty/libwebp/AUTHORS
+++ b/src/3rdparty/libwebp/AUTHORS
@@ -1,13 +1,23 @@
 Contributors:
 - Aidan O'Loan (aidanol at gmail dot com)
 - Alan Browning (browning at google dot com)
+- Alexandru Ardelean (ardeleanalex at gmail dot com)
+- Brian Ledger (brianpl at google dot com)
 - Charles Munger (clm at google dot com)
+- Cheng Yi (cyi at google dot com)
 - Christian Duvivier (cduvivier at google dot com)
+- Christopher Degawa (ccom at randomderp dot com)
+- Clement Courbet (courbet at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
+- Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
+- H. Vetinari (h dot vetinari at gmx dot com)
+- Ilya Kurdyukov (jpegqs at gmail dot com)
+- Ingvar Stepanyan (rreverser at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
+- Jeremy Maitin-Shepard (jbms at google dot com)
 - Johann Koenig (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
@@ -17,15 +27,18 @@ Contributors:
 - Mans Rullgard (mans at mansr dot com)
 - Marcin Kowalczyk (qrczak at google dot com)
 - Martin Olsson (mnemo at minimum dot se)
+- Maryla Ustarroz-Calonge (maryla at google dot com)
 - Mikołaj Zalewski (mikolajz at google dot com)
 - Mislav Bradac (mislavm at google dot com)
 - Nico Weber (thakis at chromium dot org)
 - Noel Chromium (noel at chromium dot org)
+- Oliver Wolff (oliver dot wolff at qt dot io)
 - Owen Rodley (orodley at google dot com)
 - Parag Salasakar (img dot mips1 at gmail dot com)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Roberto Alanis (alanisbaez at google dot com)
 - Sam Clegg (sbc at chromium dot org)
 - Scott Hancher (seh at google dot com)
 - Scott LaVarnway (slavarnway at google dot com)
@@ -39,5 +52,7 @@ Contributors:
 - Vikas Arora (vikasa at google dot com)
 - Vincent Rabaud (vrabaud at google dot com)
 - Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
+- Wan-Teh Chang (wtc at google dot com)
 - Yang Zhang (yang dot zhang at arm dot com)
 - Yannis Guyon (yguyon at google dot com)
+- Zhi An Ng (zhin at chromium dot org)
diff --git a/src/3rdparty/libwebp/ChangeLog b/src/3rdparty/libwebp/ChangeLog
index 10ba580..00ef617 100644
--- a/src/3rdparty/libwebp/ChangeLog
+++ b/src/3rdparty/libwebp/ChangeLog
@@ -1,3 +1,524 @@
+0ba77244 update NEWS
+e763eb1e bump version to 1.3.0
+2a8686fc update AUTHORS
+106a57c1 Merge "*/Android.mk: add a check for NDK_ROOT" into main
+c5e841c4 Merge "extras: WebpToSDL -> WebPToSDL" into main
+dbc30715 Merge "xcframeworkbuild.sh: bump MACOSX_CATALYST_MIN_VERSION" into main
+6fc1a9f9 */Android.mk: add a check for NDK_ROOT
+d3e151fc doc/api.md,webp_js/README.md: Webp -> WebP
+ed92a626 extras: WebpToSDL -> WebPToSDL
+6eb0189b xcframeworkbuild.sh: bump MACOSX_CATALYST_MIN_VERSION
+1d58575b CMake: align .pc variables with autoconf
+e5fe2cfc webp-lossless-bitstream-spec,cosmetics: reflow paragraphs
+0ceeeab9 webp-lossless-bitstream-spec: add amendment note
+607611cd Merge "webp-container-spec: normalize section title case" into main
+f853685e lossless: SUBTRACT_GREEN -> SUBTRACT_GREEN_TRANSFORM
+786497e4 webp-lossless-bitstream-spec: fix inv color txfm description
+c6ac672d webp-lossless-bitstream-spec: fix num_code_lengths check
+b5700efb webp-lossless-bitstream-spec,cosmetics: grammar/capitalization
+d8ed8c11 webp-container-spec: normalize section title case
+52ec0b8f Merge changes Ie975dbb5,Ifc8c93af,I6ca7c5d6,I2e8d66f5,I152477b8 into main
+5097ef62 webp-container-spec,cosmetics: grammar/capitalization
+e3ba2b1f webp-lossless-bitstream-spec,cosmetics: reflow abstract
+1e8e3ded webp-lossless-bitstream-spec: reword abstract re alpha
+017cb6fa webp-container-spec,cosmetics: normalize range syntax
+f6a4684b webp-lossless-bitstream-spec,cosmetics: normalize range syntax
+54ebd5a3 webp-lossless-bitstream-spec: limit dist map lut to 69 cols
+44741f9c webp-lossless-bitstream-spec: fix dist mapping example
+fad0ece7 pnmdec.c: use snprintf instead of sprintf
+3f73e8f7 sharpyuv: add SharpYuvGetVersion()
+ce2f2d66 SharpYuvConvert: fix a race on SharpYuvGetCPUInfo
+a458e308 sharpyuv_dsp.h: restore sharpyuv_cpu.h include
+9ba800a7 Merge changes Id72fbf3b,Ic59d23a2 into main
+979c0ebb sharpyuv: add SharpYuvGetCPUInfo
+8bab09a4 Merge "*.pc.in: rename lib_prefix to webp_libname_prefix" into main
+769387c5 cpu.c,cosmetics: fix a typo
+a02978c2 sharpyuv/Makefile.am+cmake: add missing -lm
+28aedcb9 *.pc.in: rename lib_prefix to webp_libname_prefix
+c42e6d5a configure.ac: export an empty lib_prefix variable
+dfc843aa Merge "*.pc.in: add lib prefix to lib names w/MSVC" into main
+2498209b *.pc.in: add lib prefix to lib names w/MSVC
+ac252b61 Merge "analysis_enc.c: fix a dead store warning" into main
+56944762 analysis_enc.c: fix a dead store warning
+d34f9b99 Merge "webp-lossless-bitstream-spec: convert BNF to ABNF" into main
+dc05b4db Merge changes I96bc063c,I45880467,If9e18e5a,I6ee938e4,I0a410b28, ... into main
+83270c7f webp-container-spec: add prose for rendering process
+73b19b64 webp-container-spec: note reserved fields MUST be ignored
+57101d3f webp-lossless-bitstream-spec: improve 'small' color table stmt
+dfd32e45 webp-container-spec: remove redundant sentence
+8a6185dd doc/webp-*: fix some punctuation, grammar
+72776530 webp-lossless-bitstream-spec: convert BNF to ABNF
+d992bb08 cmake: rename cpufeatures target to cpufeatures-webp
+3ed2b275 webp-container-spec: clarify background color note
+951c292d webp-container-spec: come too late -> out of order
+902dd787 webp-container-spec: prefer hex literals
+a8f6b5ee webp-container-spec: change SHOULD to MUST w/ANIM chunk
+1dc59435 webp-container-spec: add unknown fields MUST be ignored
+280a810f webp-container-spec: make padding byte=0 a MUST
+41f0bf68 webp-container-spec: update note on trailing data
+6bdd36db webp-container-spec: clarify Chunk Size is in bytes
+87e36c48 Merge "webp_js/README.md,cosmetics: reflow some lines" into main
+5b01f321 Merge "Update Windows makefile to build libsharpyuv library." into main
+19b1a71c webp_js/README.md,cosmetics: reflow some lines
+780db756 Update Windows makefile to build libsharpyuv library.
+e407d4b3 CMakeLists.txt: replace GLUT_glut_LIBRARY w/GLUT::GLUT
+abf73d62 Merge "WebPConfig.cmake.in: add find_dependency(Threads)" into main
+25807fb4 Merge "cmake: restore compatibility with cmake < 3.12" into main
+5dbc4bfa WebPConfig.cmake.in: add find_dependency(Threads)
+b2a175dd Merge "Update wasm instructions." into main
+cb90f76b Update wasm instructions.
+02d15258 cmake: restore compatibility with cmake < 3.12
+5ba046e2 CMake: add_definitions -> add_compile_options
+e68765af dsp,neon: use vaddv in a few more places
+e8f83de2 Set libsharpyuv include dir to 'webp' subdirectory.
+15a91ab1 cmake,cosmetics: apply cmake-format
+0dd49d1a CMakeLists.txt: set @ONLY in configure_file() calls
+62b1bfe8 Merge changes I2877e7bb,I777cad70,I15af7d1a,I686e6740,If10538a9, ... into main
+95c8fe5f Merge changes Iecea3603,I9dc228ab into main
+e7c805cf picture_csp_enc.c: remove SafeInitSharpYuv
+6af8845a sharpyuv: prefer webp/types.h
+639619ce cmake: fix dll exports
+782ed48c sharpyuv,SharpYuvInit: add mutex protection when available
+cad0d5ad sharyuv_{neon,sse2}.c: merge WEBP_USE_* sections
+ef70ee06 add a few missing <stddef.h> includes for NULL
+f0f9eda4 sharpyuv.h: remove <inttypes.h>
+9b902cba Merge "picture_csp_enc.c,CheckNonOpaque: rm unneeded local" into main
+9c1d457c cmake/cpu.cmake: remove unused variable
+9ac25bcb CMakeLists.txt,win32: match naming convention used by nmake
+76c353ba picture_csp_enc.c,CheckNonOpaque: rm unneeded local
+5000de54 Merge "cwebp: fix WebPPictureHasTransparency call" into main
+e1729309 Merge "WebPPictureHasTransparency: add missing pointer check" into main
+00ff988a vp8l_enc,AddSingleSubGreen: clear int sanitizer warnings
+e2fecc22 dsp/lossless_enc.c: clear int sanitizer warnings
+129cf9e9 dsp/lossless.c: clear int sanitizer warnings
+ad7d1753 dsp/lossless_enc.c: clear int sanitizer warnings
+5037220e VP8LSubtractGreenFromBlueAndRed_C: clear int sanitizer warnings
+2ee786c7 upsampling_sse2.c: clear int sanitizer warnings
+4cc157d4 ParseOptionalChunks: clear int sanitizer warning
+892cf033 BuildHuffmanTable: clear int sanitizer warning
+3a9a4d45 VP8GetSigned: clear int sanitizer warnings
+704a3d0a dsp/lossless.c: quiet int sanitizer warnings
+1a6c109c WebPPictureHasTransparency: add missing pointer check
+c626e7d5 cwebp: fix WebPPictureHasTransparency call
+866e349c Merge tag 'v1.2.4'
+c170df38 Merge "Create libsharpyuv.a in makefile.unix." into main
+9d7ff74a Create libsharpyuv.a in makefile.unix.
+0d1f1254 update ChangeLog (tag: v1.2.4, origin/1.2.4)
+fcbc2d78 Merge "doc/*.txt: restrict code to 69 columns" into main
+4ad0e189 Merge "webp-container-spec.txt: normalize fourcc spelling" into main
+980d2488 update NEWS
+9fde8127 bump version to 1.2.4
+7a0a9935 doc/*.txt: restrict code to 69 columns
+c040a615 webp-container-spec.txt: normalize fourcc spelling
+aff1c546 dsp,x86: normalize types w/_mm_cvtsi128_si32 calls
+ab540ae0 dsp,x86: normalize types w/_mm_cvtsi32_si128 calls
+8980362e dsp,x86: normalize types w/_mm_set* calls (2)
+e626925c lossless: fix crunch mode w/WEBP_REDUCE_SIZE
+83539239 dsp,x86: normalize types w/_mm_set* calls
+8a4576ce webp-container-spec.txt: replace &amp; with &
+db870881 Merge "webp-container-spec.txt: make reserved 0 values a MUST" into main
+01d7d378 webp-lossless-bitstream-spec: number all sections
+337cf69f webp-lossless-bitstream-spec: mv Nomenclature after Intro
+79be856e Merge changes I7111d1f7,I872cd62c into main
+5b87983a webp-container-spec.txt: make reserved 0 values a MUST
+bd939123 Merge changes I7a25b1a6,I51b2c2a0,I87d0cbcf,I6ec60af6,I0a3fe9dc into main
+04764b56 libwebp.pc: add libsharpyuv to requires
+7deee810 libsharpyuv: add pkg-config file
+1a64a7e6 webp-container-spec.txt: clarify some SHOULDs
+bec2c88a webp-container-spec.txt: move ChunkHeader to terminology
+c9359332 webp-container-spec.txt: clarify 'VP8 '/'XMP ' fourccs
+70fe3063 webp-container-spec.txt: rightsize table entries
+ddbf3f3f webp-container-spec.txt: update 'key words' text
+c151e95b utils.h,WEBP_ALIGN: make bitmask unsigned
+748e92bb add WebPInt32ToMem
+3fe15b67 Merge "Build libsharpyuv as a full installable library." into main
+4f402f34 add WebPMemToInt32
+a3b68c19 Build libsharpyuv as a full installable library.
+b4994eaa CMake: set rpath for shared objects
+94cd7117 Merge "CMake: fix dylib versioning" into main
+e91451b6 Fix the lossless specs a bit more.
+231bdfb7 CMake: fix dylib versioning
+bfad7ab5 CMakeLists.txt: correct libwebpmux name in WebPConfig.cmake
+c2e3fd30 Revert "cmake: fix webpmux lib name for cmake linking"
+7366f7f3 Merge "lossless: fix crunch mode w/WEBP_REDUCE_SIZE" into main
+84163d9d lossless: fix crunch mode w/WEBP_REDUCE_SIZE
+d01c1eb3 webp-lossless-bitstream-spec,cosmetics: normalize capitalization
+8813ca8e Merge tag 'v1.2.3'
+3c4a0fbf update ChangeLog (tag: v1.2.3)
+56a480e8 dsp/cpu.h: add missing extern "C"
+62b45bdd update ChangeLog (tag: v1.2.3-rc1)
+8764ec7a Merge changes Idb037953,Id582e395 into 1.2.3
+bcb872c3 vwebp: fix file name display in windows unicode build
+67c44ac5 webpmux: fix -frame option in windows unicode build
+8278825a makefile.unix: add sharpyuv objects to clean target
+14a49e01 update NEWS
+34b1dc33 bump version to 1.2.3
+0b397fda update AUTHORS
+c16488ac update .mailmap
+5a2d929c Merge "unicode.h: set console mode before using wprintf" into main
+169f867f unicode.h: set console mode before using wprintf
+a94b855c Merge "libsharpyuv: add version defines" into main
+f83bdb52 libsharpyuv: add version defines
+bef0d797 unicode_gif.h: fix -Wdeclaration-after-statement
+404c1622 Rename Huffman coding to prefix coding in the bitstream spec
+8895f8a3 Merge "run_static_analysis.sh: fix scan-build archive path" into main
+92a673d2 Merge "Add -fvisibility=hidden flag in CMakeLists." into main
+67c1d722 Merge "add WEBP_MSAN" into main
+1124ff66 Add -fvisibility=hidden flag in CMakeLists.
+e15b3560 add WEBP_MSAN
+ec9e782a sharpyuv: remove minimum image size from sharpyuv library
+7bd07f3b run_static_analysis.sh: fix scan-build archive path
+5ecee06f Merge "sharpyuv: increase precision of gamma<->linear conversion" into main
+f81dd7d6 Merge changes I3d17d529,I53026880,I1bd61639,I6bd4b25d,Icfec8fba into main
+2d607ee6 sharpyuv: increase precision of gamma<->linear conversion
+266cbbc5 sharpyuv: add 32bit version of SharpYuvFilterRow.
+9fc12274 CMake: add src to webpinfo includes
+7d18f40a CMake: add WEBP_BUILD_WEBPINFO to list of checks for exampleutil
+11309aa5 CMake: add WEBP_BUILD_WEBPMUX to list of checks for exampleutil
+4bc762f7 CMake: link imageioutil to exampleutil after defined
+0d1b9bc4 WEBP_DEP_LIBRARIES: use Threads::Threads
+20ef48f0 Merge "sharpyuv: add support for 10/12/16 bit rgb and 10/12 bit yuv." into main
+93c54371 sharpyuv: add support for 10/12/16 bit rgb and 10/12 bit yuv.
+53cf2b49 normalize WebPValidatePicture declaration w/definition
+d3006f4b sharpyuv: slightly improve precision
+ea967098 Merge changes Ia01bd397,Ibf3771af into main
+11bc8410 Merge changes I2d317c4b,I9e77f6db into main
+30453ea4 Add an internal WebPValidatePicture.
+6c43219a Some renamings for consistency.
+4f59fa73 update .mailmap
+e74f8a62 webp-lossless-bitstream-spec,cosmetics: normalize range syntax
+5a709ec0 webp-lossless-bitstream-spec,cosmetics: fix code typo
+a2093acc webp-lossless-bitstream-spec: add amendment note
+86c66930 webp-lossless-bitstream-spec: fix BNF
+232f22da webp-lossless-bitstream-spec: fix 'simple code' snippet
+44dd765d webp-lossless-bitstream-spec: fix ColorTransform impl
+7a7e33e9 webp-lossless-bitstream-spec: fix TR-pixel right border note
+86f94ee0 Update lossless spec with Huffman codes.
+a3927cc8 sharpyuv.c,cosmetics: fix indent
+6c45cef7 Make sure the stride has a minimum value in the importer.
+0c8b0e67 sharpyuv: cleanup/cosmetic changes
+dc3841e0 {histogram,predictor}_enc: quiet int -> float warnings
+a19a25bb Replace doubles by floats in lossless misc cost estimations.
+42888f6c Add an option to enable static builds.
+7efcf3cc Merge "Fix typo in color constants: Marix -> Matrix" into main
+8f4b5c62 Fix typo in color constants: Marix -> Matrix
+90084d84 Merge "demux,IsValidExtendedFormat: remove unused variable" into main
+ed643f61 Merge changes I452d2485,Ic6d75475 into main
+8fa053d1 Rename SharpYUV to SharpYuv for consistency.
+99a87562 SharpYuvComputeConversionMatrix: quiet int->float warnings
+deb426be Makefile.vc: add sharpyuv_csp.obj to SHARPYUV_OBJS
+779597d4 demux,IsValidExtendedFormat: remove unused variable
+40e8aa57 Merge "libsharpyuv: add colorspace utilities" into main
+01a05de1 libsharpyuv: add colorspace utilities
+2de4b05a Merge changes Id9890a60,I376d81e6,I1c958838 into main
+b8bca81f Merge "configure.ac: use LT_INIT if available" into main
+e8e77b9c Merge changes I479bc487,I39864691,I5d486c2c,I186d13be into main
+7e7d5d50 Merge ".gitignore: add Android Studio & VS code dirs" into main
+10c50848 normalize label indent
+89f774e6 mux{edit,internal}: fix leaks on error
+2d3293ad ExUtilInitCommandLineArguments: fix leak on error
+ec34fd70 anim_util: fix leaks on error
+e4717287 gif2webp: fix segfault on OOM
+e3cfafaf GetBackwardReferences: fail on alloc error
+a828a59b BackwardReferencesHashChainDistanceOnly: fix segfault on OOM
+fe153fae VP8LEncodeStream: fix segfault on OOM
+919acc0e .gitignore: add Android Studio & VS code dirs
+efa0731b configure.ac: use LT_INIT if available
+0957fd69 tiffdec: add grayscale support
+e685feef Merge "Make libsharpyuv self-contained by removing dependency on cpu.c" into main
+841960b6 Make libsharpyuv self-contained by removing dependency on cpu.c
+617cf036 image_dec: add WebPGetEnabledInputFileFormats()
+7a68afaa Let SharpArgbToYuv caller pass in an RGB>YUV conversion matrix.
+34bb332c man/cwebp.1: add note about crop/resize order
+f0e9351c webp-lossless-bitstream-spec,cosmetics: fix some typos
+5ccbd6ed vp8l_dec.c,cosmetics: fix a few typos
+c3d0c2d7 fix ios build scripts after sharpyuv dep added
+d0d2292e Merge "Make libwebp depend on libsharpyuv." into main
+03d12190 alpha_processing_neon.c: fix 0x01... typo
+d55d447c Make libwebp depend on libsharpyuv.
+e4cbcdd2 Fix lossless encoding for MIPS.
+924e7ca6 alpha_processing_neon.c: fix Dispatch/ExtractAlpha_NEON
+0fa0ea54 Makefile.vc: use /MANIFEST:EMBED
+29cc95ce Basic version of libsharpyuv in libwebp, in C.
+a30f2190 examples/webpmux.c: fix a couple of typos
+66b3ce23 Fix bad overflow check in ReadTIFF()
+54e61a38 Markdownify libwebp docs and reorganize them.
+b4533deb CMakeLists.txt,cosmetics: break long line
+b9d2f9cd quant_enc.c: use WEBP_RESTRICT qualifier
+ec178f2c Add progress hook granularity in lossless
+26139c73 Rename MAX_COST to MAX_BIT_COST in histogram_enc.c
+13b82816 cmake: fix webpmux lib name for cmake linking
+88b6a396 webp-container-spec.txt,cosmetics: normalize formatting
+6f496540 Merge tag 'v1.2.2'
+4074acf8 dsp.h: bump msvc arm64 version requirement to 16.6
+b0a86089 update ChangeLog (tag: v1.2.2)
+6db8248c libwebp: Fix VP8EncTokenLoop() progress
+827a307f BMP enc: fix the transparency case
+db25f1b4 libwebp: Fix VP8EncTokenLoop() progress
+286e7fce libwebp: do not destroy jpeg codec twice on error
+6e8a4126 libwebp: do not destroy jpeg codec twice on error
+faf21968 Merge "BMP enc: fix the transparency case" into main
+480cd51d BMP enc: fix the transparency case
+9195ea05 update ChangeLog (tag: v1.2.2-rc2)
+4acae017 update NEWS
+883f0633 man/img2webp.1: update date
+567e1f44 Reword img2webp synopsis command line
+1b0c15db man/img2webp.1: update date
+17bade38 Merge "Reword img2webp synopsis command line" into main
+a80954a1 Reword img2webp synopsis command line
+f084244d anim_decode: fix alpha blending with big-endian
+b217b4ff webpinfo: fix fourcc comparison w/big-endian
+ec497b75 Merge "anim_decode: fix alpha blending with big-endian" into main
+e4886716 anim_decode: fix alpha blending with big-endian
+e3cb052c webpinfo: fix fourcc comparison w/big-endian
+a510fedb patch-check: detect duplicated files
+f035d2e4 update ChangeLog (tag: v1.2.2-rc1)
+7031946a update NEWS
+973390b6 bump version to 1.2.2
+abd6664f update AUTHORS
+5b7e7930 Merge "add missing USE_{MSA,NEON} checks in headers" into main
+02ca04c3 add missing USE_{MSA,NEON} checks in headers
+e94716e2 xcframeworkbuild.sh: place headers in a subdir
+c846efd8 patch-check: commit subject length check
+b6f756e8 update http links
+8f5cb4c1 update rfc links
+8ea81561 change VP8LPredictorFunc signature to avoid reading 'left'
+6b1d18c3 webpmux: fix the -bgcolor description
+3368d876 Merge "webpmux: add "-set bgcolor A,R,G,B"" into main
+f213abf6 webpinfo: print the number of warnings
+50c97c30 webpmux: add "-set bgcolor A,R,G,B"
+2c206aaf Remove CMakeLists.txt check in compile.sh
+96e3dfef Merge "infra/common.sh: add shard_should_run()" into main
+0e0f74b7 infra/common.sh: add shard_should_run()
+35b7436a Jenkins scripts port: update shell function comments
+21d24b4c webp-container-spec.txt: remove 'experimental' markers
+cdcf8902 Merge "Port Jenkins script: compile" into main
+dc683cde Jenkins scripts port: static analysis
+0858494e Port Jenkins script: compile
+c2cf6a93 Jenkins scripts port: android compilation
+df0e808f presubmit: Add pylint-2.7 and .pylintrc
+676c57db patch-check: shfmt
+7bb7f747 patch-check: Add shellcheck
+abcd1797 Reformat docstrings and imports
+edaf0895 Port Jenkins scripts: compile js
+b9622063 Set CheckPatchFormatted flags to fail on diffs
+e23cd548 dsp.h: enable NEON w/VS2019+ ARM64 targets
+3875c7de CMakeLists.txt: set minimum version to 3.7
+1a8f0d45 Have a hard-coded value for memset in TrellisQuantizeBlock.
+93480160 Speed up TrellisQuantizeBlock
+45eaacc9 Convert deprecated uint32 to uint32_t.
+42592af8 webp,cmake: Remove unnecessary include dirs
+e298e05f Add patch-check steps in PRESUBMIT.py
+29148919 Merge tag 'v1.2.1'
+9ce5843d update ChangeLog (tag: v1.2.1)
+d9191588 fuzzer/*: normalize src/ includes
+c5bc3624 fuzzer/*: normalize src/ includes
+53b6f762 fix indent
+d2caaba4 fix indent
+731246ba update ChangeLog (tag: v1.2.1-rc2)
+d250f01d dsp/*: use WEBP_HAVE_* to determine Init availability
+1fe31625 dsp/*: use WEBP_HAVE_* to determine Init availability
+3a4d3ecd update NEWS
+b2bc8093 bump version to 1.2.1
+e542fc7a update AUTHORS
+e0241154 Merge "libwebp/CMake: Add <BUILD_INTERFACE> to webp incl" into main
+edea6444 libwebp/CMake: Add <BUILD_INTERFACE> to webp incl
+ece18e55 dsp.h: respect --disable-sse2/sse4.1/neon
+a89a3230 wicdec: support alpha from WebP WIC decoder
+26f4aa01 Merge "alpha_processing: fix visual studio warnings" into main
+8f594663 alpha_processing: fix visual studio warnings
+46d844e6 Merge "cpu.cmake: fix compiler flag detection w/3.17.0+" into main
+298d26ea Merge changes I593adf92,If20675e7,Ifac68eac into main
+a1e5dae0 alpha_processing*: use WEBP_RESTRICT qualifier
+327ef24f cpu.cmake: fix compiler flag detection w/3.17.0+
+f70819de configure: enable libwebpmux by default
+dc7e2b42 configure: add informational notices when disabling binaries
+9df23ddd configure: move lib flag checks before binaries
+a2e18f10 Merge "WebPConfig.config.in: correct WEBP_INCLUDE_DIRS" into main
+e1a8d4f3 Merge "bit_reader_inl_utils: uniformly apply WEBP_RESTRICT" into main
+4de35f43 rescaler.c: fix alignment
+0f13eec7 bit_reader_inl_utils: uniformly apply WEBP_RESTRICT
+277d3074 Fix size_t overflow in  WebPRescalerInit
+97adbba5 WebPConfig.config.in: correct WEBP_INCLUDE_DIRS
+b60d4603 advanced_api_fuzzer: add extreme config value coverage
+72fe52f6 anim_encode.c,cosmetics: normalize indent
+116d235c anim_encode: Fix encoded_frames_[] overflow
+6f445b3e CMake: set CMP0072 to NEW
+b1cf887f define WEBP_RESTRICT for MSVC
+3e265136 Add WEBP_RESTRICT & use it in VP8BitReader
+f6d29247 vp8l_dec::ProcessRows: fix int overflow in multiply
+de3b4ba8 CMake: add WEBP_BUILD_LIBWEBPMUX
+7f09d3d1 CMakeLists.txt: rm libwebpmux dep from anim_{diff,dump}
+4edea4a6 Init{RGB,YUV}Rescaler: fix a few more int overflows
+c9e26bdb rescaler_utils: set max valid scaled w/h to INT_MAX/2
+28d488e6 utils.h: add SizeOverflow()
+695bdaa2 Export/EmitRescaledRowsRGBA: fix pointer offset int overflow
+685d073e Init{RGB,YUV}Rescaler: fix int overflows in multiplication
+d38bd0dd WebPFlipBuffer: fix integer overflow
+109ff0f1 utils: allow MALLOC_LIMIT to indicate a max
+a2fce867 WebPRescalerImportRowExpand_C: promote some vals before multiply
+776983d4 AllocateBuffer: fix int multiplication overflow check
+315abbd6 Merge "Revert "Do not use a palette for one color images.""
+eae815d0 Merge changes Ica3bbf75,I82f82954
+afbca5a1 Require Emscripten 2.0.18
+3320416b CMakeLists,emscripten: use EXPORTED_RUNTIME_METHODS
+29145ed6 Update README instructions for using Emscripten
+1f579139 cosmetics: remove use of 'sanity' / 'master'
+29b6129c WebPAnimEncoderNewInternal: remove some unnecessary inits
+b60869a1 Revert "Do not use a palette for one color images."
+6fb4cddc demux: move padded size calc post unpadded validation
+05b72d42 vp8l_enc.c: normalize index types
+b6513fba Do not use a palette for one color images.
+98bbe35b Fix multi-threading with palettes.
+b1674240 Add modified Zeng's method to palette sorting.
+88c90c45 add CONTRIBUTING.md
+6a9916d7 WebPRescalerInit: add missing int64_t promotion
+b6cf52d5 WebPIoInitFromOptions: treat use_scaling as a bool
+3b12b7f4 WebPIoInitFromOptions: treat use_cropping as a bool
+595fa13f add WebPCheckCropDimensions()
+8fdaecb0 Disable cross-color when palette is used.
+8933bac2 WebPIoInitFromOptions: respect incoming bypass_filtering val
+7d416ff0 webpdec,cosmetics: match error text to function call
+ec6cfeb5 Fix typo on WebPPictureAlloc() in README
+7e58a1a2 *.cmake: add license header
+5651a6b2 cmake: fix .so versioning
+25ae67b3 xcframeworkbuild.sh: add arm64 simulator target
+5d4ee4c3 cosmetics: remove use of the term 'dummy'
+01b38ee1 faster CollectColorXXXTransforms_SSE41
+652aa344 Merge "Use BitCtz for FastSLog2Slow_C"
+0320e1e3 add the missing default BitsCtz() code
+8886f620 Use BitCtz for FastSLog2Slow_C
+fae41617 faster CombinedShannonEntropy_SSE2
+5bd2704e Introduce the BitCtz() function.
+fee64287 Merge "wicdec,icc: treat unsupported op as non-fatal"
+33ddb894 lossless_sse{2,41}: remove some unneeded includes
+b27ea852 wicdec,icc: treat unsupported op as non-fatal
+b78494a9 Merge "Fix undefined signed shift."
+e79974cd Fix undefined signed shift.
+a8853394 SSE4.1 versions of BGRA to RGB/BGR color-space conversions
+a09a6472 SSE4.1 version of TransformColorInverse
+401da22b Merge "pngdec: check version before using png_get_chunk_malloc_max"
+26907822 pngdec: check version before using png_get_chunk_malloc_max
+06c1e72e Code cleanup
+8f0d41aa Merge changes Id135bbf4,I99e59797
+373eb170 gif2webp: don't store loop-count if there's only 1 frame
+759b9d5a cmake: add WEBP_USE_THREAD option
+926ce921 cmake: don't install binaries from extras/
+9c367bc6 WebPAnimDecoderNewInternal: validate bitstream before alloc
+47f64f6e filters_sse2: import Chromium change
+cc3577e9 fuzzer/*: use src/ based include paths
+004d77ff Merge tag 'v1.2.0'
+fedac6cc update ChangeLog (tag: v1.2.0-rc3, tag: v1.2.0)
+170a8712 Fix check_c_source_compiles with pthread.
+ceddb5fc Fix check_c_source_compiles with pthread.
+85995719 disable CombinedShannonEntropy_SSE2 on x86
+289757fe TiffDec: enforce stricter mem/dimension limit on tiles
+8af7436f Merge "{ios,xcframework}build.sh: make min version(s) more visible" into 1.2.0
+e56c3c5b pngdec: raise memory limit if needed
+8696147d pngdec: raise memory limit if needed
+13b8e9fe {ios,xcframework}build.sh: make min version(s) more visible
+a9225410 animdecoder_fuzzer: fix memory leak
+d6c2285d update gradle to 6.1.1
+8df77fb1 animdecoder_fuzzer: fix memory leak
+52ce6333 update NEWS
+28c49820 bump version to 1.2.0
+7363dff2 webp/encode.h: restore WEBP_ENCODER_ABI_VERSION to v1.1.0
+826aafa5 update AUTHORS
+63258823 animdecoder_fuzzer: validate canvas size
+9eb26381 CMake: remove duplicate "include(GNUInstallDirs)"
+2e7bed79 WebPPicture: clarify the ownership of user-owned data.
+cccf5e33 webpmux: add an '-set loop <value>' option
+c9a3f6a1 Merge changes Ie29f9867,I289c54c4
+319f56f1 iosbuild.sh: sync some aspects of xcframeworkbuild.sh
+e8e8db98 add xcframeworkbuild.sh
+ae545534 dsp.h: allow config.h to override MSVC SIMD autodetection
+fef789f3 Merge "cmake: fix per-file assembly flags"
+fc14fc03 Have C encoding predictors use decoding predictors.
+7656f0b3 README,cosmetics: fix a couple typos
+d2e245ea cmake: disable webp.js if WEBP_ENABLE_SIMD=1
+96099a79 cmake: fix per-file assembly flags
+5abb5582 Merge "cmake: fix compilation w/Xcode generator"
+8484a120 cmake: fix compilation w/Xcode generator
+d7bf01c9 Merge changes Ifcae0f38,Iee2d7401
+36c81ff6 WASM-SIMD: port 2 patches from rreverser@'s tree
+988b02ab Merge "Couple of fixes to allow SIMD on Emscripten"
+26faf770 wicdec: fail with animated images
+ab2d08a8 [cd]webp: document lack of animated webp support
+52273943 Couple of fixes to allow SIMD on Emscripten
+8870ba7f Fix skia bug #10952
+4b3c6953 Detect if StoreFrame read more than anmf_payload_size bytes
+17fd4ba8 webp/decode.h,cosmetics: normalize 'flip' comment
+411d3677 remove some unreachable break statements
+3700ffd7 WebPPictureHasTransparency: remove unreachable return
+83604bf3 {animencoder,enc_dec}_fuzzer: convert some abort()s to returns
+eb44119c Merge changes I8ae09473,I678c8b1e
+9f6055fc fuzz_utils.h: rename max() to Max()
+695788e7 fuzz_utils.h: make functions WEBP_INLINE
+906c1fcd make ImgIoUtilReadFile use WebPMalloc instead of malloc
+8cb7e536 rename demux_api_fuzzer.c -> mux_demux_api_fuzzer.c
+443db47d add animdecoder_fuzzer.cc
+36a6eea3 Merge "import fuzzers from oss-fuzz/chromium"
+ec5f12c1 Makefile.vc: remove deprecated /Gm option
+64425a08 picture_tools_enc: fix windows build warning
+bd94090a import fuzzers from oss-fuzz/chromium
+cf847cba use WEBP_DSP_INIT_FUNC for Init{GammaTables*,GetCoeffs}
+55a080e5 Add WebPReplaceTransparentPixels() in dsp
+84739717 GetBackgroundColorGIF: promote to uint32_t before << 24
+def64e92 cwebp: Fix -print_psnr for near_lossless
+cf2f88b3 Add palette and spatial for q >= 75 and -m 5
+f0110bae Add no-color cache configuration to the cruncher
+749a8b99 Better estimate of the cache cost.
+4f9f00cc Use spatial predictors on top of palette no matter what.
+7658c686 Add spatial prediction on top of palette in cruncher.
+133ff0e3 webp_js: force WASM=0 option explicitly
+e3c259a2 Fix integer overflow in EmitFancyRGB.
+b3ff0bde man/{gif2,img2}webp,webpmux: normalize some wording
+f9b30586 fix ABI breakage introduced by 6a0ff358
+1d58dcfc README.webp_js: update note about emscripten version
+44070266 README.webp_js: s/fastcomp/upstream/
+2565fa8f README.webp_js: update cmake command
+47309ef5 webp: WEBP_OFFSET_PTR()
+687ab00e DC{4,8,16}_NEON: replace vmovl w/vaddl
+1b92fe75 DC16_NEON,aarch64: use vaddlv
+53f3d8cf dec_neon,DC8_NEON: use vaddlv instead of movl+vaddv
+27d08240 Fix integer overflow in WebPAnimDecoderGetNext()
+69776e38 Merge "remove call to MBAnalyzeBestIntra4Mode for method >= 5"
+a99078c1 remove call to MBAnalyzeBestIntra4Mode for method >= 5
+22e404cc CMakeLists.txt: fix set(CACHE) argument order
+71690b52 fix MSVC warning
+6a0ff358 Enc: add a qmin / qmax range for quality factor
+0fa56f30 Merge tag 'v1.1.0'
+6cf504d0 PNM decoding: handle max_value != 255
+d7844e97 update ChangeLog (tag: v1.1.0-rc2, tag: v1.1.0)
+7f006436 Makefile.vc: fix webp_quality.exe link
+cf047e83 Makefile.vc: fix webp_quality.exe link
+c074c653 update NEWS
+30f09551 bump version to 1.1.0
+a76694a1 update AUTHORS
+6e3ef7b3 extras: fix WEBP_SWAP_16BIT_CSP check
+47178dbd extras: add WebPUnmultiplyARGB() convenience function
+22cbae33 idec_dec: fix 0 offset of NULL pointer
+290dd0b4 muxread: fix 0 offset of NULL pointer
+0df474ac Merge "lossless_(enc_|)sse2: avoid offsetting a NULL pointer"
+c6b75a19 lossless_(enc_|)sse2: avoid offsetting a NULL pointer
+295e5e38 fix UBSAN warning
+e2575e05 DC8_NEON,aarch64: use vaddv
+b0e09e34 dec_neon: Fix build failure under some toolchains
+cf0e903c dsp/lossless: Fix non gcc ARM builds
+bb7bc40b Remove ubsan errors.
+78881b76 CMake: fix GLUT library link
+9f750f7a cmake: fix BUILD_SHARED_LIBS build on mac
+17850e74 libwebp: Remove char-subscripts warning in pnmdec.c
+2fa2552d Merge "Expose WebPMalloc() in addition to WebPFree()"
+a4df4aae Expose WebPMalloc() in addition to WebPFree()
+853ea3d8 imageio/tiff: Return error before allocating bad tile size
+af650c0b Fix a Wxor-used-as-pow false positive
+601ef17c libwebp.py: update to swig 3.0.12
+0e48d889 bugfix: last alpha rows were incorrectly decoded
+24d2ccb4 webp: Fix imageio ReadPNM() TUPLTYPE
+fab8f9cf cosmetics: normalize '*' association
+94138e0e update .gitignore
+0fe1a89d update ChangeLog (tag: v1.0.3-rc1, tag: v1.0.3)
 2ad0916d update NEWS
 1287362b bump version to 1.0.3
 7b968cc2 update AUTHORS
diff --git a/src/3rdparty/libwebp/NEWS b/src/3rdparty/libwebp/NEWS
index 5c0fc8c..c4f8ef7 100644
--- a/src/3rdparty/libwebp/NEWS
+++ b/src/3rdparty/libwebp/NEWS
@@ -1,3 +1,68 @@
+- 12/16/2022: version 1.3.0
+  This is a binary compatible release.
+  * add libsharpyuv, which exposes -sharp_yuv/config.use_sharp_yuv
+    functionality to other libraries; libwebp now depends on this library
+  * major updates to the container and lossless bitstream docs (#448, #546,
+    #551)
+  * miscellaneous warning, bug & build fixes (#576, #583, #584)
+
+- 8/4/2022: version 1.2.4
+  This is a binary compatible release.
+  * restore CMake libwebpmux target name for compatibility with 1.2.2 (#575)
+  * fix lossless crunch mode encoding with WEBP_REDUCE_SIZE
+    (chromium: #1345547, #1345595, #1345772, #1345804)
+
+- 6/30/2022: version 1.2.3
+  This is a binary compatible release.
+  * security fix for lossless encoder (#565, chromium:1313709)
+  * improved progress granularity in WebPReportProgress() when using lossless
+  * improved precision in Sharp YUV (-sharp_yuv) conversion
+  * many corrections to webp-lossless-bitstream-spec.txt (#551)
+  * crash/leak fixes on error/OOM and other bug fixes (#558, #563, #569, #573)
+
+- 1/11/2022: version 1.2.2
+  This is a binary compatible release.
+  * webpmux: add "-set bgcolor A,R,G,B"
+  * add ARM64 NEON support for MSVC builds (#539)
+  * fix duplicate include error in Xcode when using multiple XCFrameworks in a
+    project (#542)
+  * doc updates and bug fixes (#538, #544, #548, #550)
+
+- 7/20/2021: version 1.2.1
+  This is a binary compatible release.
+  * minor lossless encoder improvements and x86 color conversion speed up
+  * add ARM64 simulator support to xcframeworkbuild.sh (#510)
+  * further security related hardening in libwebp & examples
+    (issues: #497, #508, #518)
+    (chromium: #1196480, #1196773, #1196775, #1196777, #1196778, #1196850)
+    (oss-fuzz: #28658, #28978)
+  * toolchain updates and bug fixes (#498, #501, #502, #504, #505, #506, #509,
+                                     #533)
+  * use more inclusive language within the source (#507)
+
+- 12/23/2020: version 1.2.0
+  * API changes:
+    - libwebp:
+      encode.h: add a qmin / qmax range for quality factor (cwebp adds -qrange)
+  * lossless encoder improvements
+  * SIMD support for Wasm builds
+  * add xcframeworkbuild.sh, supports Mac Catalyst builds
+  * import fuzzers from oss-fuzz & chromium (#409)
+  * webpmux: add an '-set loop <value>' option (#494)
+  * toolchain updates and bug fixes (#449, #463, #470, #475, #477, #478, #479,
+    #488, #491)
+
+- 12/18/2019: version 1.1.0
+  * API changes:
+    - libwebp:
+      WebPMalloc (issue #442)
+    - extras:
+      WebPUnmultiplyARGB
+  * alpha decode fix (issue #439)
+  * toolchain updates and bug fixes
+    (chromium: #1026858, #1027136, #1027409, #1028620, #1028716, #995200)
+    (oss-fuzz: #19430, #19447)
+
 - 7/4/2019: version 1.0.3
   This is a binary compatible release.
   * resize fixes for Nx1 sizes and the addition of non-opaque alpha values for
diff --git a/src/3rdparty/libwebp/README b/src/3rdparty/libwebp/README
index 60da8a2..f6eaf2c 100644
--- a/src/3rdparty/libwebp/README
+++ b/src/3rdparty/libwebp/README
@@ -4,7 +4,7 @@
           \__\__/\____/\_____/__/ ____  ___
                 / _/ /    \    \ /  _ \/ _/
                /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v1.0.3
+               \____/____/\_____/_____/____/v1.2.2
 
 Description:
 ============
@@ -13,13 +13,13 @@ WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.
 
-See http://developers.google.com/speed/webp
+See https://developers.google.com/speed/webp
 
 The latest source tree is available at
 https://chromium.googlesource.com/webm/libwebp
 
 It is released under the same license as the WebM project.
-See http://www.webmproject.org/license/software/ or the
+See https://www.webmproject.org/license/software/ or the
 "COPYING" file for details. An additional intellectual
 property rights grant can be found in the file PATENTS.
 
@@ -113,7 +113,7 @@ make install
 
 CMake:
 ------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp, webpinfo
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
 and the JS bindings.
 
 Prerequisites:
@@ -225,6 +225,7 @@ Usage:
 
 If input size (-s) for an image is not specified, it is
 assumed to be a PNG, JPEG, TIFF or WebP file.
+Note: Animated PNG and WebP files are not supported.
 
 Options:
   -h / -help ............. short help
@@ -254,6 +255,8 @@ Options:
   -partition_limit <int> . limit quality to fit the 512k limit on
                            the first partition (0=no degradation ... 100=full)
   -pass <int> ............ analysis pass number (1..10)
+  -qrange <min> <max> .... specifies the permissible quality range
+                           (default: 0 100)
   -crop <x> <y> <w> <h> .. crop picture with the given rectangle
   -resize <w> <h> ........ resize picture (after any cropping)
   -mt .................... use multi-threading if available
@@ -294,6 +297,7 @@ Experimental Options:
   -af .................... auto-adjust filter strength
   -pre <int> ............. pre-processing filter
 
+
 The main options you might want to try in order to further tune the
 visual quality are:
  -preset
@@ -341,7 +345,9 @@ The full list of options is available using -h:
 > dwebp -h
 Usage: dwebp in_file [options] [-o out_file]
 
-Decodes the WebP image file to PNG format [Default]
+Decodes the WebP image file to PNG format [Default].
+Note: Animated WebP files are not supported.
+
 Use following options to convert into alternate image formats:
   -pam ......... save the raw RGBA samples as a color PAM
   -ppm ......... save the raw RGB samples as a color PPM
@@ -423,15 +429,15 @@ Prerequisites:
 1) OpenGL & OpenGL Utility Toolkit (GLUT)
   Linux:
     $ sudo apt-get install freeglut3-dev mesa-common-dev
-  Mac + XCode:
+  Mac + Xcode:
     - These libraries should be available in the OpenGL / GLUT frameworks.
   Windows:
     http://freeglut.sourceforge.net/index.php#download
 
 2) (Optional) qcms (Quick Color Management System)
   i. Download qcms from Mozilla / Chromium:
-    http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
-    http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+    https://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+    https://source.chromium.org/chromium/chromium/src/+/main:third_party/qcms/;drc=d4a2f8e1ed461d8fc05ed88d1ae2dc94c9773825
   ii. Build and archive the source files as libqcms.a / qcms.lib
   iii. Update makefile.unix / Makefile.vc
     a) Define WEBP_HAVE_QCMS
@@ -450,7 +456,7 @@ modes, etc.
 
 Usage:
 
-  img2webp [file-level options] [image files...] [per-frame options...]
+  img2webp [file_options] [[frame_options] frame_file]...
 
 File-level options (only used at the start of compression):
  -min_size ............ minimize size
@@ -613,7 +619,7 @@ The encoding flow looks like:
   pic.width = width;
   pic.height = height;
   // allocated picture of dimension width x height
-  if (!WebPPictureAllocate(&pic)) {
+  if (!WebPPictureAlloc(&pic)) {
     return 0;   // memory error
   }
   // at this point, 'pic' has been initialized as a container,
@@ -780,10 +786,10 @@ Bugs:
 Please report all bugs to the issue tracker:
     https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
-    http://www.webmproject.org/code/contribute/submitting-patches/
+    https://www.webmproject.org/code/contribute/submitting-patches/
 
 Discuss:
 ========
 
 Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/src/3rdparty/libwebp/patches/0001-Fix-Windows-build-for-clang-and-neon.patch b/src/3rdparty/libwebp/patches/0001-Fix-Windows-build-for-clang-and-neon.patch
new file mode 100644
index 0000000..2b46f5b
--- /dev/null
+++ b/src/3rdparty/libwebp/patches/0001-Fix-Windows-build-for-clang-and-neon.patch
@@ -0,0 +1,38 @@
+diff --git a/src/3rdparty/libwebp/src/dsp/cpu.h b/src/3rdparty/libwebp/src/dsp/cpu.h
+index 57a40d8..8cf3e92 100644
+--- a/src/3rdparty/libwebp/src/dsp/cpu.h
++++ b/src/3rdparty/libwebp/src/dsp/cpu.h
+@@ -14,6 +14,8 @@
+ #ifndef WEBP_DSP_CPU_H_
+ #define WEBP_DSP_CPU_H_
+ 
++#include <qglobal.h>
++
+ #ifdef HAVE_CONFIG_H
+ #include "src/webp/config.h"
+ #endif
+@@ -43,12 +45,12 @@
+ 
+ #if !defined(HAVE_CONFIG_H)
+ #if defined(_MSC_VER) && _MSC_VER > 1310 && \
+-    (defined(_M_X64) || defined(_M_IX86))
++    (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__)
+ #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+ #endif
+ 
+ #if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+-    (defined(_M_X64) || defined(_M_IX86))
++    (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__)
+ #define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+ #endif
+ #endif
+@@ -97,7 +99,8 @@
+ // arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
+ // vtbl4_u8(); a fix was made in 16.6.
+ #if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+-                          (_MSC_VER >= 1926 && defined(_M_ARM64)))
++                          (_MSC_VER >= 1926 && defined(_M_ARM64))) && \
++                         !defined(__clang__) && (QT_CONFIG_neon == 1)
+ #define WEBP_USE_NEON
+ #define WEBP_USE_INTRINSICS
+ #endif
diff --git a/src/3rdparty/libwebp/qt_attribution.json b/src/3rdparty/libwebp/qt_attribution.json
index 8d15db5..258f7ff 100644
--- a/src/3rdparty/libwebp/qt_attribution.json
+++ b/src/3rdparty/libwebp/qt_attribution.json
@@ -6,7 +6,7 @@
 
     "Description": "WebP is a new image format that provides lossless and lossy compression for images on the web.",
     "Homepage": "https://developers.google.com/speed/webp/",
-    "Version": "1.0.3",
+    "Version": "1.3.0",
     "License": "BSD 3-clause \"New\" or \"Revised\" License",
     "LicenseId": "BSD-3-Clause",
     "LicenseFile": "COPYING",
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv.c
new file mode 100644
index 0000000..7de34fb
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv.c
@@ -0,0 +1,526 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/webp/types.h"
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "sharpyuv/sharpyuv_dsp.h"
+#include "sharpyuv/sharpyuv_gamma.h"
+
+//------------------------------------------------------------------------------
+
+int SharpYuvGetVersion(void) {
+  return SHARPYUV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+
+#define YUV_FIX 16  // fixed-point precision for RGB->YUV
+static const int kYuvHalf = 1 << (YUV_FIX - 1);
+
+// Max bit depth so that intermediate calculations fit in 16 bits.
+static const int kMaxBitDepth = 14;
+
+// Returns the precision shift to use based on the input rgb_bit_depth.
+static int GetPrecisionShift(int rgb_bit_depth) {
+  // Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
+  // bits if needed.
+  return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2
+                                               : (kMaxBitDepth - rgb_bit_depth);
+}
+
+typedef int16_t fixed_t;      // signed type with extra precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra precision for W
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static uint16_t clip(fixed_t v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static fixed_y_t clip_bit_depth(int y, int bit_depth) {
+  const int max = (1 << bit_depth) - 1;
+  return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int64_t r, int64_t g, int64_t b) {
+  const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf;
+  return (int)(luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+                          int rgb_bit_depth) {
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  const uint32_t A = SharpYuvGammaToLinear(a, bit_depth);
+  const uint32_t B = SharpYuvGammaToLinear(b, bit_depth);
+  const uint32_t C = SharpYuvGammaToLinear(c, bit_depth);
+  const uint32_t D = SharpYuvGammaToLinear(d, bit_depth);
+  return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
+                                int rgb_bit_depth) {
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  int i;
+  for (i = 0; i < w; ++i) {
+    const uint32_t R = SharpYuvGammaToLinear(src[0 * w + i], bit_depth);
+    const uint32_t G = SharpYuvGammaToLinear(src[1 * w + i], bit_depth);
+    const uint32_t B = SharpYuvGammaToLinear(src[2 * w + i], bit_depth);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth);
+  }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w, int rgb_bit_depth) {
+  int i;
+  for (i = 0; i < uv_w; ++i) {
+    const int r =
+        ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0],
+                  src2[0 * uv_w + 1], rgb_bit_depth);
+    const int g =
+        ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0],
+                  src2[2 * uv_w + 1], rgb_bit_depth);
+    const int b =
+        ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0],
+                  src2[4 * uv_w + 1], rgb_bit_depth);
+    const int W = RGBToGray(r, g, b);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
+  }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i;
+  assert(w > 0);
+  for (i = 0; i < w; ++i) {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_bit_depth(v0 + W0, bit_depth);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Shift(int v, int shift) {
+  return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+                         const uint8_t* const g_ptr,
+                         const uint8_t* const b_ptr,
+                         int rgb_step,
+                         int rgb_bit_depth,
+                         int pic_width,
+                         fixed_y_t* const dst) {
+  // Convert the rgb_step from a number of bytes to a number of uint8_t or
+  // uint16_t values depending the bit depth.
+  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
+  int i;
+  const int w = (pic_width + 1) & ~1;
+  for (i = 0; i < pic_width; ++i) {
+    const int off = i * step;
+    const int shift = GetPrecisionShift(rgb_bit_depth);
+    if (rgb_bit_depth == 8) {
+      dst[i + 0 * w] = Shift(r_ptr[off], shift);
+      dst[i + 1 * w] = Shift(g_ptr[off], shift);
+      dst[i + 2 * w] = Shift(b_ptr[off], shift);
+    } else {
+      dst[i + 0 * w] = Shift(((uint16_t*)r_ptr)[off], shift);
+      dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift);
+      dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift);
+    }
+  }
+  if (pic_width & 1) {  // replicate rightmost pixel
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+  }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
+                               int w,
+                               fixed_y_t* out1,
+                               fixed_y_t* out2,
+                               int rgb_bit_depth) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  while (k-- > 0) {   // process each R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth);
+
+    SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1,
+                      bit_depth);
+    SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1,
+                      bit_depth);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0], bit_depth);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w], bit_depth);
+    }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
+  }
+}
+
+static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
+                                         const int coeffs[4], int sfix) {
+  const int srounder = 1 << (YUV_FIX + sfix - 1);
+  const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b +
+                   coeffs[3] + srounder;
+  return (luma >> (YUV_FIX + sfix));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+                            uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
+                            int u_stride, uint8_t* v_ptr, int v_stride,
+                            int rgb_bit_depth,
+                            int yuv_bit_depth, int width, int height,
+                            const SharpYuvConversionMatrix* yuv_matrix) {
+  int i, j;
+  const fixed_t* const best_uv_base = best_uv;
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  const int sfix = GetPrecisionShift(rgb_bit_depth);
+  const int yuv_max = (1 << yuv_bit_depth) - 1;
+
+  for (best_uv = best_uv_base, j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
+      if (yuv_bit_depth <= 8) {
+        y_ptr[i] = clip_8b(y);
+      } else {
+        ((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
+      }
+    }
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    y_ptr += y_stride;
+  }
+  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+    for (i = 0; i < uv_w; ++i) {
+      const int off = i;
+      // Note r, g and b values here are off by W, but a constant offset on all
+      // 3 components doesn't change the value of u and v with a YCbCr matrix.
+      const int r = best_uv[off + 0 * uv_w];
+      const int g = best_uv[off + 1 * uv_w];
+      const int b = best_uv[off + 2 * uv_w];
+      const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
+      const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
+      if (yuv_bit_depth <= 8) {
+        u_ptr[i] = clip_8b(u);
+        v_ptr[i] = clip_8b(v);
+      } else {
+        ((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
+        ((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
+      }
+    }
+    best_uv += 3 * uv_w;
+    u_ptr += u_stride;
+    v_ptr += v_stride;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+static void* SafeMalloc(uint64_t nmemb, size_t size) {
+  const uint64_t total_size = nmemb * (uint64_t)size;
+  if (total_size != (size_t)total_size) return NULL;
+  return malloc((size_t)total_size);
+}
+
+#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T)))
+
+static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                            const uint8_t* b_ptr, int rgb_step, int rgb_stride,
+                            int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
+                            uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
+                            int v_stride, int yuv_bit_depth, int width,
+                            int height,
+                            const SharpYuvConversionMatrix* yuv_matrix) {
+  // we expand the right/bottom border if needed
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
+
+  // TODO(skal): allocate one big memory chunk. But for now, it's easier
+  // for valgrind debugging to have several chunks.
+  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+  int ok;
+  assert(w > 0);
+  assert(h > 0);
+
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
+      best_rgb_y == NULL || best_rgb_uv == NULL ||
+      tmp_buffer == NULL) {
+    ok = 0;
+    goto End;
+  }
+
+  // Import RGB samples to W/RGB representation.
+  for (j = 0; j < height; j += 2) {
+    const int is_last_row = (j == height - 1);
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
+    fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+    // prepare two rows of input
+    ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
+                 src1);
+    if (!is_last_row) {
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+                   rgb_step, rgb_bit_depth, width, src2);
+    } else {
+      memcpy(src2, src1, 3 * w * sizeof(*src2));
+    }
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w, rgb_bit_depth);
+    UpdateW(src2, target_y + w, w, rgb_bit_depth);
+    UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
+  }
+
+  // Iterate and resolve clipping conflicts.
+  for (iter = 0; iter < kNumIterations; ++iter) {
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
+    for (j = 0; j < h; j += 2) {
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
+      fixed_y_t* const src2 = tmp_buffer + 3 * w;
+      {
+        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
+                           src1, src2, rgb_bit_depth);
+        prev_uv = cur_uv;
+        cur_uv = next_uv;
+      }
+
+      UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth);
+      UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth);
+
+      // update two rows of Y and one row of RGB
+      diff_y_sum +=
+          SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
+                          rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
+      SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
+    }
+    // test exit condition
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
+    }
+    prev_diff_y_sum = diff_y_sum;
+  }
+
+  // final reconstruction
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
+                        u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
+                        width, height, yuv_matrix);
+
+ End:
+  free(best_y_base);
+  free(best_uv_base);
+  free(target_y_base);
+  free(target_uv_base);
+  free(best_rgb_y);
+  free(best_rgb_uv);
+  free(tmp_buffer);
+  return ok;
+}
+#undef SAFE_ALLOC
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define LOCK_ACCESS \
+    static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
+    if (pthread_mutex_lock(&sharpyuv_lock)) return
+#define UNLOCK_ACCESS_AND_RETURN                  \
+    do {                                          \
+      (void)pthread_mutex_unlock(&sharpyuv_lock); \
+      return;                                     \
+    } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define LOCK_ACCESS do {} while (0)
+#define UNLOCK_ACCESS_AND_RETURN return
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Hidden exported init function.
+// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
+// users can declare it as extern and call it with an alternate VP8CPUInfo
+// function.
+SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func);
+void SharpYuvInit(VP8CPUInfo cpu_info_func) {
+  static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
+      (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
+  LOCK_ACCESS;
+  // Only update SharpYuvGetCPUInfo when called from external code to avoid a
+  // race on reading the value in SharpYuvConvert().
+  if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) {
+    SharpYuvGetCPUInfo = cpu_info_func;
+  }
+  if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) {
+    UNLOCK_ACCESS_AND_RETURN;
+  }
+
+  SharpYuvInitDsp();
+  SharpYuvInitGammaTables();
+
+  sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo;
+  UNLOCK_ACCESS_AND_RETURN;
+}
+
+int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
+                    const void* b_ptr, int rgb_step, int rgb_stride,
+                    int rgb_bit_depth, void* y_ptr, int y_stride,
+                    void* u_ptr, int u_stride, void* v_ptr,
+                    int v_stride, int yuv_bit_depth, int width,
+                    int height, const SharpYuvConversionMatrix* yuv_matrix) {
+  SharpYuvConversionMatrix scaled_matrix;
+  const int rgb_max = (1 << rgb_bit_depth) - 1;
+  const int rgb_round = 1 << (rgb_bit_depth - 1);
+  const int yuv_max = (1 << yuv_bit_depth) - 1;
+  const int sfix = GetPrecisionShift(rgb_bit_depth);
+
+  if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
+      r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
+      u_ptr == NULL || v_ptr == NULL) {
+    return 0;
+  }
+  if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
+      rgb_bit_depth != 16) {
+    return 0;
+  }
+  if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
+    return 0;
+  }
+  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride %2 != 0)) {
+    // Step/stride should be even for uint16_t buffers.
+    return 0;
+  }
+  if (yuv_bit_depth > 8 &&
+      (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
+    // Stride should be even for uint16_t buffers.
+    return 0;
+  }
+  // The address of the function pointer is used to avoid a read race.
+  SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo);
+
+  // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
+  // rgb->yuv conversion matrix.
+  if (rgb_bit_depth == yuv_bit_depth) {
+    memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
+  } else {
+    int i;
+    for (i = 0; i < 3; ++i) {
+      scaled_matrix.rgb_to_y[i] =
+          (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
+      scaled_matrix.rgb_to_u[i] =
+          (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
+      scaled_matrix.rgb_to_v[i] =
+          (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
+    }
+  }
+  // Also incorporate precision change scaling.
+  scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
+  scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
+  scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
+
+  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
+                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
+                          v_ptr, v_stride, yuv_bit_depth, width, height,
+                          &scaled_matrix);
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv.h b/src/3rdparty/libwebp/sharpyuv/sharpyuv.h
new file mode 100644
index 0000000..181b20a
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv.h
@@ -0,0 +1,103 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_H_
+#define WEBP_SHARPYUV_SHARPYUV_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SHARPYUV_EXTERN
+#ifdef WEBP_EXTERN
+#define SHARPYUV_EXTERN WEBP_EXTERN
+#else
+// This explicitly marks library functions and allows for changing the
+// signature for e.g., Windows DLL builds.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define SHARPYUV_EXTERN extern __attribute__((visibility("default")))
+#else
+#if defined(_MSC_VER) && defined(WEBP_DLL)
+#define SHARPYUV_EXTERN __declspec(dllexport)
+#else
+#define SHARPYUV_EXTERN extern
+#endif /* _MSC_VER && WEBP_DLL */
+#endif /* __GNUC__ >= 4 */
+#endif /* WEBP_EXTERN */
+#endif /* SHARPYUV_EXTERN */
+
+// SharpYUV API version following the convention from semver.org
+#define SHARPYUV_VERSION_MAJOR 0
+#define SHARPYUV_VERSION_MINOR 2
+#define SHARPYUV_VERSION_PATCH 0
+// Version as a uint32_t. The major number is the high 8 bits.
+// The minor number is the middle 8 bits. The patch number is the low 16 bits.
+#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
+  (((MAJOR) << 24) | ((MINOR) << 16) | (PATCH))
+#define SHARPYUV_VERSION                                                \
+  SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
+                        SHARPYUV_VERSION_PATCH)
+
+// Returns the library's version number, packed in hexadecimal. See
+// SHARPYUV_VERSION.
+SHARPYUV_EXTERN int SharpYuvGetVersion(void);
+
+// RGB to YUV conversion matrix, in 16 bit fixed point.
+// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then y, u and v values are divided by 1<<16 and rounded.
+typedef struct {
+  int rgb_to_y[4];
+  int rgb_to_u[4];
+  int rgb_to_v[4];
+} SharpYuvConversionMatrix;
+
+// Converts RGB to YUV420 using a downsampling algorithm that minimizes
+// artefacts caused by chroma subsampling.
+// This is slower than standard downsampling (averaging of 4 UV values).
+// Assumes that the image will be upsampled using a bilinear filter. If nearest
+// neighbor is used instead, the upsampled image might look worse than with
+// standard downsampling.
+// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
+//     to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
+// rgb_step: distance in bytes between two horizontally adjacent pixels on the
+//     r, g and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_stride: distance in bytes between two vertically adjacent pixels on the
+//     r, g, and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
+//     Note: 16 bit input is truncated to 14 bits before conversion to yuv.
+// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
+// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels.  Should
+//     point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
+//     otherwise.
+// y_stride, u_stride, v_stride: distance in bytes between two vertically
+//     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
+//     should be multiples of 2.
+// width, height: width and height of the image in pixels
+SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
+                                    const void* b_ptr, int rgb_step,
+                                    int rgb_stride, int rgb_bit_depth,
+                                    void* y_ptr, int y_stride, void* u_ptr,
+                                    int u_stride, void* v_ptr, int v_stride,
+                                    int yuv_bit_depth, int width, int height,
+                                    const SharpYuvConversionMatrix* yuv_matrix);
+
+// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
+// support (it's rarely used in practice, especially for images).
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_H_
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.c
new file mode 100644
index 0000000..29425a0
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.c
@@ -0,0 +1,14 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+#include "sharpyuv/sharpyuv_cpu.h"
+
+// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The
+// function pointer is renamed in sharpyuv_cpu.h.
+#include "src/dsp/cpu.c"
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.h b/src/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.h
new file mode 100644
index 0000000..176ca3e
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.h
@@ -0,0 +1,22 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_
+#define WEBP_SHARPYUV_SHARPYUV_CPU_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds.
+// SharpYuvInit() replaces the use of the function pointer.
+#undef WEBP_EXTERN
+#define WEBP_EXTERN extern
+#define VP8GetCPUInfo SharpYuvGetCPUInfo
+#include "src/dsp/cpu.h"
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_CPU_H_
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_csp.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv_csp.c
new file mode 100644
index 0000000..0ad22be
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_csp.c
@@ -0,0 +1,110 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#include "sharpyuv/sharpyuv_csp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+
+static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); }
+
+void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
+                                     SharpYuvConversionMatrix* matrix) {
+  const float kr = yuv_color_space->kr;
+  const float kb = yuv_color_space->kb;
+  const float kg = 1.0f - kr - kb;
+  const float cr = 0.5f / (1.0f - kb);
+  const float cb = 0.5f / (1.0f - kr);
+
+  const int shift = yuv_color_space->bit_depth - 8;
+
+  const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
+  float scale_y = 1.0f;
+  float add_y = 0.0f;
+  float scale_u = cr;
+  float scale_v = cb;
+  float add_uv = (float)(128 << shift);
+  assert(yuv_color_space->bit_depth >= 8);
+
+  if (yuv_color_space->range == kSharpYuvRangeLimited) {
+    scale_y *= (219 << shift) / denom;
+    scale_u *= (224 << shift) / denom;
+    scale_v *= (224 << shift) / denom;
+    add_y = (float)(16 << shift);
+  }
+
+  matrix->rgb_to_y[0] = ToFixed16(kr * scale_y);
+  matrix->rgb_to_y[1] = ToFixed16(kg * scale_y);
+  matrix->rgb_to_y[2] = ToFixed16(kb * scale_y);
+  matrix->rgb_to_y[3] = ToFixed16(add_y);
+
+  matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u);
+  matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u);
+  matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u);
+  matrix->rgb_to_u[3] = ToFixed16(add_uv);
+
+  matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v);
+  matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v);
+  matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v);
+  matrix->rgb_to_v[3] = ToFixed16(add_uv);
+}
+
+// Matrices are in YUV_FIX fixed point precision.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+static const SharpYuvConversionMatrix kWebpMatrix = {
+  {16839, 33059, 6420, 16 << 16},
+  {-9719, -19081, 28800, 128 << 16},
+  {28800, -24116, -4684, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
+  {16829, 33039, 6416, 16 << 16},
+  {-9714, -19071, 28784, 128 << 16},
+  {28784, -24103, -4681, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec601FullMatrix = {
+  {19595, 38470, 7471, 0},
+  {-11058, -21710, 32768, 128 << 16},
+  {32768, -27439, -5329, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
+  {11966, 40254, 4064, 16 << 16},
+  {-6596, -22189, 28784, 128 << 16},
+  {28784, -26145, -2639, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec709FullMatrix = {
+  {13933, 46871, 4732, 0},
+  {-7509, -25259, 32768, 128 << 16},
+  {32768, -29763, -3005, 128 << 16},
+};
+
+const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+    SharpYuvMatrixType matrix_type) {
+  switch (matrix_type) {
+    case kSharpYuvMatrixWebp:
+      return &kWebpMatrix;
+    case kSharpYuvMatrixRec601Limited:
+      return &kRec601LimitedMatrix;
+    case kSharpYuvMatrixRec601Full:
+      return &kRec601FullMatrix;
+    case kSharpYuvMatrixRec709Limited:
+      return &kRec709LimitedMatrix;
+    case kSharpYuvMatrixRec709Full:
+      return &kRec709FullMatrix;
+    case kSharpYuvMatrixNum:
+      return NULL;
+  }
+  return NULL;
+}
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_csp.h b/src/3rdparty/libwebp/sharpyuv/sharpyuv_csp.h
new file mode 100644
index 0000000..3214e3a
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_csp.h
@@ -0,0 +1,60 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_CSP_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Range of YUV values.
+typedef enum {
+  kSharpYuvRangeFull,     // YUV values between [0;255] (for 8 bit)
+  kSharpYuvRangeLimited   // Y in [16;235], YUV in [16;240] (for 8 bit)
+} SharpYuvRange;
+
+// Constants that define a YUV color space.
+typedef struct {
+  // Kr and Kb are defined such that:
+  // Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
+  float kr;
+  float kb;
+  int bit_depth;  // 8, 10 or 12
+  SharpYuvRange range;
+} SharpYuvColorSpace;
+
+// Fills in 'matrix' for the given YUVColorSpace.
+SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
+    const SharpYuvColorSpace* yuv_color_space,
+    SharpYuvConversionMatrix* matrix);
+
+// Enums for precomputed conversion matrices.
+typedef enum {
+  kSharpYuvMatrixWebp = 0,
+  kSharpYuvMatrixRec601Limited,
+  kSharpYuvMatrixRec601Full,
+  kSharpYuvMatrixRec709Limited,
+  kSharpYuvMatrixRec709Full,
+  kSharpYuvMatrixNum
+} SharpYuvMatrixType;
+
+// Returns a pointer to a matrix for one of the predefined colorspaces.
+SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+    SharpYuvMatrixType matrix_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_CSP_H_
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.c
new file mode 100644
index 0000000..31c272c
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.c
@@ -0,0 +1,103 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "sharpyuv/sharpyuv_cpu.h"
+
+//-----------------------------------------------------------------------------
+
+#if !WEBP_NEON_OMIT_C_CODE
+static uint16_t clip(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src,
+                                  uint16_t* dst, int len, int bit_depth) {
+  uint64_t diff = 0;
+  int i;
+  const int max_y = (1 << bit_depth) - 1;
+  for (i = 0; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip(new_y, max_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src,
+                                int16_t* dst, int len) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len,
+                                const uint16_t* best_y, uint16_t* out,
+                                int bit_depth) {
+  int i;
+  const int max_y = (1 << bit_depth) - 1;
+  for (i = 0; i < len; ++i, ++A, ++B) {
+    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+    out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//-----------------------------------------------------------------------------
+
+uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+                            uint16_t* dst, int len, int bit_depth);
+void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
+                          int len);
+void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+                          const uint16_t* best_y, uint16_t* out,
+                          int bit_depth);
+
+extern void InitSharpYuvSSE2(void);
+extern void InitSharpYuvNEON(void);
+
+void SharpYuvInitDsp(void) {
+#if !WEBP_NEON_OMIT_C_CODE
+  SharpYuvUpdateY = SharpYuvUpdateY_C;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_C;
+  SharpYuvFilterRow = SharpYuvFilterRow_C;
+#endif
+
+  if (SharpYuvGetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+    if (SharpYuvGetCPUInfo(kSSE2)) {
+      InitSharpYuvSSE2();
+    }
+#endif  // WEBP_HAVE_SSE2
+  }
+
+#if defined(WEBP_HAVE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) {
+    InitSharpYuvNEON();
+  }
+#endif  // WEBP_HAVE_NEON
+
+  assert(SharpYuvUpdateY != NULL);
+  assert(SharpYuvUpdateRGB != NULL);
+  assert(SharpYuvFilterRow != NULL);
+}
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.h b/src/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.h
new file mode 100644
index 0000000..805fbad
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.h
@@ -0,0 +1,28 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_DSP_H_
+
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "src/webp/types.h"
+
+extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+                                   uint16_t* dst, int len, int bit_depth);
+extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref,
+                                 int16_t* dst, int len);
+extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+                                 const uint16_t* best_y, uint16_t* out,
+                                 int bit_depth);
+
+void SharpYuvInitDsp(void);
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_DSP_H_
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.c
new file mode 100644
index 0000000..20ab2da
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.c
@@ -0,0 +1,113 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#include "sharpyuv/sharpyuv_gamma.h"
+
+#include <assert.h>
+#include <math.h>
+
+#include "src/webp/types.h"
+
+// Gamma correction compensates loss of resolution during chroma subsampling.
+// Size of pre-computed table for converting from gamma to linear.
+#define GAMMA_TO_LINEAR_TAB_BITS 10
+#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS)
+static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2];
+#define LINEAR_TO_GAMMA_TAB_BITS 9
+#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS)
+static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2];
+
+static const double kGammaF = 1. / 0.45;
+#define GAMMA_TO_LINEAR_BITS 16
+
+static volatile int kGammaTablesSOk = 0;
+void SharpYuvInitGammaTables(void) {
+  assert(GAMMA_TO_LINEAR_BITS <= 16);
+  if (!kGammaTablesSOk) {
+    int v;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+    // Precompute gamma to linear table.
+    {
+      const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE;
+      const double a_rec = 1. / (1. + a);
+      for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) {
+        const double g = norm * v;
+        double value;
+        if (g <= thresh * 4.5) {
+          value = g / 4.5;
+        } else {
+          value = pow(a_rec * (g + a), kGammaF);
+        }
+        kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+      }
+      // to prevent small rounding errors to cause read-overflow:
+      kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] =
+          kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE];
+    }
+    // Precompute linear to gamma table.
+    {
+      const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE;
+      for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) {
+        const double g = scale * v;
+        double value;
+        if (g <= thresh) {
+          value = 4.5 * g;
+        } else {
+          value = (1. + a) * pow(g, 1. / kGammaF) - a;
+        }
+        kLinearToGammaTabS[v] =
+            (uint32_t)(final_scale * value + 0.5);
+      }
+      // to prevent small rounding errors to cause read-overflow:
+      kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] =
+          kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE];
+    }
+    kGammaTablesSOk = 1;
+  }
+}
+
+static WEBP_INLINE int Shift(int v, int shift) {
+  return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
+                                                    int tab_pos_shift_right,
+                                                    int tab_value_shift) {
+  const uint32_t tab_pos = Shift(v, -tab_pos_shift_right);
+  // fractional part, in 'tab_pos_shift' fixed-point precision
+  const uint32_t x = v - (tab_pos << tab_pos_shift_right);  // fractional part
+  // v0 / v1 are in kGammaToLinearBits fixed-point precision (range [0..1])
+  const uint32_t v0 = Shift(tab[tab_pos + 0], tab_value_shift);
+  const uint32_t v1 = Shift(tab[tab_pos + 1], tab_value_shift);
+  // Final interpolation.
+  const uint32_t v2 = (v1 - v0) * x;  // note: v1 >= v0.
+  const int half =
+      (tab_pos_shift_right > 0) ? 1 << (tab_pos_shift_right - 1) : 0;
+  const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift_right);
+  return result;
+}
+
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth) {
+  const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
+  if (shift > 0) {
+    return kGammaToLinearTabS[v << shift];
+  }
+  return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
+}
+
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth) {
+  return FixedPointInterpolation(
+      value, kLinearToGammaTabS,
+      (GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS),
+      bit_depth - GAMMA_TO_LINEAR_BITS);
+}
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.h b/src/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.h
new file mode 100644
index 0000000..d13aff5
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.h
@@ -0,0 +1,35 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+
+#include "src/webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initializes precomputed tables. Must be called once before calling
+// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
+void SharpYuvInitGammaTables(void);
+
+// Converts a gamma color value on 'bit_depth' bits to a 16 bit linear value.
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth);
+
+// Converts a 16 bit linear color value to a gamma value on 'bit_depth' bits.
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_neon.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv_neon.c
new file mode 100644
index 0000000..5840914
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_neon.c
@@ -0,0 +1,181 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_NEON)
+#include <assert.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+static uint16_t clip_NEON(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t max = vdupq_n_s16(max_y);
+  uint64x2_t sum = vdupq_n_u64(0);
+  uint64_t diff;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_NEON(new_y, max_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const int16x8_t max = vdupq_n_s16(max_y);
+  const int16x8_t zero = vdupq_n_s16(0);
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+    const int16x8_t e0 = vrhaddq_s16(c1, a0);
+    const int16x8_t e1 = vrhaddq_s16(c0, a1);
+    const int16x8x2_t f = vzipq_s16(e0, e1);
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const uint16x8_t max = vdupq_n_u16(max_y);
+  for (i = 0; i + 4 <= len; i += 4) {
+    const int16x4_t a0 = vld1_s16(A + i + 0);
+    const int16x4_t a1 = vld1_s16(A + i + 1);
+    const int16x4_t b0 = vld1_s16(B + i + 0);
+    const int16x4_t b1 = vld1_s16(B + i + 1);
+    const int32x4_t a0b1 = vaddl_s16(a0, b1);
+    const int32x4_t a1b0 = vaddl_s16(a1, b0);
+    const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0);  // A0+A1+B0+B1
+    const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1);    // 2*(A0+B1)
+    const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0);    // 2*(A1+B0)
+    const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
+    const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
+    const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
+    const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
+    const int32x4x2_t f = vzipq_s32(e0, e1);
+
+    const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
+    const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
+    const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
+    const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
+    const uint16x8_t i_clamped = vminq_u16(i_16, max);
+    vst1q_u16(out + 2 * i + 0, i_clamped);
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out,
+                                   int bit_depth) {
+  if (bit_depth <= 10) {
+    SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
+  } else {
+    SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
+  SharpYuvUpdateY = SharpYuvUpdateY_NEON;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
+  SharpYuvFilterRow = SharpYuvFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+extern void InitSharpYuvNEON(void);
+
+void InitSharpYuvNEON(void) {}
+
+#endif  // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/sharpyuv/sharpyuv_sse2.c b/src/3rdparty/libwebp/sharpyuv/sharpyuv_sse2.c
new file mode 100644
index 0000000..9744d1b
--- /dev/null
+++ b/src/3rdparty/libwebp/sharpyuv/sharpyuv_sse2.c
@@ -0,0 +1,201 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>
+#include <emmintrin.h>
+
+static uint16_t clip_SSE2(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  uint64_t diff = 0;
+  uint32_t tmp[4];
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i sum = zero;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
+    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
+    const __m128i F = _mm_add_epi16(C, D);       // new_y
+    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
+    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
+    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
+    _mm_storeu_si128((__m128i*)(dst + i), H);
+    sum = _mm_add_epi32(sum, I);
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_SSE2(new_y, max_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i = 0;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
+    const __m128i E = _mm_add_epi16(C, D);   // new_uv
+    _mm_storeu_si128((__m128i*)(dst + i), E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const __m128i kCst8 = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+    const __m128i a0b1 = _mm_add_epi16(a0, b1);
+    const __m128i a1b0 = _mm_add_epi16(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi16(c1, a0);
+    const __m128i d1 = _mm_add_epi16(c0, a1);
+    const __m128i e0 = _mm_srai_epi16(d0, 1);
+    const __m128i e1 = _mm_srai_epi16(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+    const __m128i h0 = _mm_add_epi16(g0, f0);
+    const __m128i h1 = _mm_add_epi16(g1, f1);
+    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
+    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
+  return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
+}
+
+static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const __m128i kCst8 = _mm_set1_epi32(8);
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 4 <= len; i += 4) {
+    const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
+    const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
+    const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
+    const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
+    const __m128i a0b1 = _mm_add_epi32(a0, b1);
+    const __m128i a1b0 = _mm_add_epi32(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1);  // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0);  // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi32(c1, a0);
+    const __m128i d1 = _mm_add_epi32(c0, a1);
+    const __m128i e0 = _mm_srai_epi32(d0, 1);
+    const __m128i e1 = _mm_srai_epi32(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
+    const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
+    const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out,
+                                   int bit_depth) {
+  if (bit_depth <= 10) {
+    SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
+  } else {
+    SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
+  SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
+  SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
+}
+#else  // !WEBP_USE_SSE2
+
+extern void InitSharpYuvSSE2(void);
+
+void InitSharpYuvSSE2(void) {}
+
+#endif  // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dec/alpha_dec.c b/src/3rdparty/libwebp/src/dec/alpha_dec.c
index bce735b..0b93a30 100644
--- a/src/3rdparty/libwebp/src/dec/alpha_dec.c
+++ b/src/3rdparty/libwebp/src/dec/alpha_dec.c
@@ -183,7 +183,7 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
   assert(dec != NULL && io != NULL);
 
   if (row < 0 || num_rows <= 0 || row + num_rows > height) {
-    return NULL;    // sanity check.
+    return NULL;
   }
 
   if (!dec->is_alpha_decoded_) {
diff --git a/src/3rdparty/libwebp/src/dec/buffer_dec.c b/src/3rdparty/libwebp/src/dec/buffer_dec.c
index 3cd94eb..4786cf0 100644
--- a/src/3rdparty/libwebp/src/dec/buffer_dec.c
+++ b/src/3rdparty/libwebp/src/dec/buffer_dec.c
@@ -102,7 +102,7 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     int stride;
     uint64_t size;
 
-    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {
       return VP8_STATUS_INVALID_PARAM;
     }
     stride = w * kModeBpp[mode];
@@ -117,7 +117,6 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     }
     total_size = size + 2 * uv_size + a_size;
 
-    // Security/sanity checks
     output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
     if (output == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
@@ -156,11 +155,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
   }
   if (WebPIsRGBMode(buffer->colorspace)) {
     WebPRGBABuffer* const buf = &buffer->u.RGBA;
-    buf->rgba += (buffer->height - 1) * buf->stride;
+    buf->rgba += (int64_t)(buffer->height - 1) * buf->stride;
     buf->stride = -buf->stride;
   } else {
     WebPYUVABuffer* const buf = &buffer->u.YUVA;
-    const int H = buffer->height;
+    const int64_t H = buffer->height;
     buf->y += (H - 1) * buf->y_stride;
     buf->y_stride = -buf->y_stride;
     buf->u += ((H - 1) >> 1) * buf->u_stride;
@@ -188,8 +187,7 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
       const int ch = options->crop_height;
       const int x = options->crop_left & ~1;
       const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
-          x + cw > width || y + ch > height) {
+      if (!WebPCheckCropDimensions(width, height, x, y, cw, ch)) {
         return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
       }
       width = cw;
diff --git a/src/3rdparty/libwebp/src/dec/frame_dec.c b/src/3rdparty/libwebp/src/dec/frame_dec.c
index bda9e1a..91ca1f8 100644
--- a/src/3rdparty/libwebp/src/dec/frame_dec.c
+++ b/src/3rdparty/libwebp/src/dec/frame_dec.c
@@ -705,7 +705,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
                         + cache_size + alpha_size + WEBP_ALIGN_CST;
   uint8_t* mem;
 
-  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (!CheckSizeOverflow(needed)) return 0;  // check for overflow
   if (needed > dec->mem_size_) {
     WebPSafeFree(dec->mem_);
     dec->mem_size_ = 0;
@@ -732,7 +732,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += f_info_size;
   dec->thread_ctx_.id_ = 0;
   dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->mt_method_ > 0) {
+  if (dec->filter_type_ > 0 && dec->mt_method_ > 0) {
     // secondary cache line. The deblocking process need to make use of the
     // filtering strength from previous macroblock row, while the new ones
     // are being decoded in parallel. We'll just swap the pointers.
diff --git a/src/3rdparty/libwebp/src/dec/idec_dec.c b/src/3rdparty/libwebp/src/dec/idec_dec.c
index 9bc9166..9035df5 100644
--- a/src/3rdparty/libwebp/src/dec/idec_dec.c
+++ b/src/3rdparty/libwebp/src/dec/idec_dec.c
@@ -166,9 +166,11 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
   VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
   MemBuffer* const mem = &idec->mem_;
   const int need_compressed_alpha = NeedCompressedAlpha(idec);
-  const uint8_t* const old_start = mem->buf_ + mem->start_;
+  const uint8_t* const old_start =
+      (mem->buf_ == NULL) ? NULL : mem->buf_ + mem->start_;
   const uint8_t* const old_base =
       need_compressed_alpha ? dec->alpha_data_ : old_start;
+  assert(mem->buf_ != NULL || mem->start_ == 0);
   assert(mem->mode_ == MEM_MODE_APPEND);
   if (data_size > MAX_CHUNK_PAYLOAD) {
     // security safeguard: trying to allocate more than what the format
@@ -184,7 +186,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
     uint8_t* const new_buf =
         (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
     if (new_buf == NULL) return 0;
-    memcpy(new_buf, old_base, current_size);
+    if (old_base != NULL) memcpy(new_buf, old_base, current_size);
     WebPSafeFree(mem->buf_);
     mem->buf_ = new_buf;
     mem->buf_size_ = (size_t)extra_size;
@@ -192,6 +194,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
     mem->end_ = current_size;
   }
 
+  assert(mem->buf_ != NULL);
   memcpy(mem->buf_ + mem->end_, data, data_size);
   mem->end_ += data_size;
   assert(mem->end_ <= mem->buf_size_);
@@ -204,7 +207,9 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
                           const uint8_t* const data, size_t data_size) {
   MemBuffer* const mem = &idec->mem_;
   const uint8_t* const old_buf = mem->buf_;
-  const uint8_t* const old_start = old_buf + mem->start_;
+  const uint8_t* const old_start =
+      (old_buf == NULL) ? NULL : old_buf + mem->start_;
+  assert(old_buf != NULL || mem->start_ == 0);
   assert(mem->mode_ == MEM_MODE_MAP);
 
   if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
diff --git a/src/3rdparty/libwebp/src/dec/io_dec.c b/src/3rdparty/libwebp/src/dec/io_dec.c
index e603f19..5ef6298 100644
--- a/src/3rdparty/libwebp/src/dec/io_dec.c
+++ b/src/3rdparty/libwebp/src/dec/io_dec.c
@@ -25,21 +25,16 @@
 static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
   WebPDecBuffer* output = p->output;
   const WebPYUVABuffer* const buf = &output->u.YUVA;
-  uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
-  uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
-  uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+  uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
+  uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
   const int uv_w = (mb_w + 1) / 2;
   const int uv_h = (mb_h + 1) / 2;
-  int j;
-  for (j = 0; j < mb_h; ++j) {
-    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
-  }
-  for (j = 0; j < uv_h; ++j) {
-    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
-    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
-  }
+  WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h);
+  WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h);
+  WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h);
   return io->mb_h;
 }
 
@@ -47,7 +42,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
   WebPDecBuffer* const output = p->output;
   WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
   WebPSamplerProcessPlane(io->y, io->y_stride,
                           io->u, io->v, io->uv_stride,
                           dst, buf->stride, io->mb_w, io->mb_h,
@@ -62,7 +57,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
   int num_lines_out = io->mb_h;   // a priori guess
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
   WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
   const uint8_t* cur_y = io->y;
   const uint8_t* cur_u = io->u;
@@ -133,7 +128,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
   const WebPYUVABuffer* const buf = &p->output->u.YUVA;
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
-  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
   int j;
   (void)expected_num_lines_out;
   assert(expected_num_lines_out == mb_h);
@@ -186,7 +181,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
         (colorspace == MODE_ARGB || colorspace == MODE_Argb);
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
     int num_rows;
-    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
     uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
     const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
@@ -210,7 +205,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
     const WEBP_CSP_MODE colorspace = p->output->colorspace;
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
     int num_rows;
-    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
     uint8_t* alpha_dst = base_rgba;
@@ -276,9 +271,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
                                 int expected_num_lines_out) {
   const WebPYUVABuffer* const buf = &p->output->u.YUVA;
-  uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
+  uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
   if (io->a != NULL) {
-    uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+    uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
     const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
     assert(expected_num_lines_out == num_lines_out);
     if (num_lines_out > 0) {   // unmultiply the Y
@@ -303,46 +298,57 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int uv_out_height = (out_height + 1) >> 1;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
-  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
+  // scratch memory for luma rescaler
+  const size_t work_size = 2 * (size_t)out_width;
   const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
-  size_t tmp_size, rescaler_size;
+  uint64_t total_size;
+  size_t rescaler_size;
   rescaler_t* work;
   WebPRescaler* scalers;
   const int num_rescalers = has_alpha ? 4 : 3;
 
-  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
+  total_size = ((uint64_t)work_size + 2 * uv_work_size) * sizeof(*work);
   if (has_alpha) {
-    tmp_size += work_size * sizeof(*work);
+    total_size += (uint64_t)work_size * sizeof(*work);
   }
   rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+  total_size += rescaler_size;
+  if (!CheckSizeOverflow(total_size)) {
+    return 0;
+  }
 
-  p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
+  p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
 
-  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+  scalers = (WebPRescaler*)WEBP_ALIGN(
+      (const uint8_t*)work + total_size - rescaler_size);
   p->scaler_y = &scalers[0];
   p->scaler_u = &scalers[1];
   p->scaler_v = &scalers[2];
   p->scaler_a = has_alpha ? &scalers[3] : NULL;
 
-  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
-                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   work);
-  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
-                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   work + work_size);
-  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
-                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   work + work_size + uv_work_size);
+  if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+                        buf->y, out_width, out_height, buf->y_stride, 1,
+                        work) ||
+      !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+                        buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                        work + work_size) ||
+      !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+                        buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                        work + work_size + uv_work_size)) {
+    return 0;
+  }
   p->emit = EmitRescaledYUV;
 
   if (has_alpha) {
-    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
-                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     work + work_size + 2 * uv_work_size);
+    if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+                          buf->a, out_width, out_height, buf->a_stride, 1,
+                          work + work_size + 2 * uv_work_size)) {
+      return 0;
+    }
     p->emit_alpha = EmitRescaledAlphaYUV;
     WebPInitAlphaProcessing();
   }
@@ -356,7 +362,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
   const WebPYUV444Converter convert =
       WebPYUV444Converters[p->output->colorspace];
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + y_pos * buf->stride;
+  uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
   int num_lines_out = 0;
   // For RGB rescaling, because of the YUV420, current scan position
   // U/V can be +1/-1 line from the Y one.  Hence the double test.
@@ -383,15 +389,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
   while (j < mb_h) {
     const int y_lines_in =
         WebPRescalerImport(p->scaler_y, mb_h - j,
-                           io->y + j * io->y_stride, io->y_stride);
+                           io->y + (size_t)j * io->y_stride, io->y_stride);
     j += y_lines_in;
     if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
-      const int u_lines_in =
-          WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
-                             io->u + uv_j * io->uv_stride, io->uv_stride);
-      const int v_lines_in =
-          WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
-                             io->v + uv_j * io->uv_stride, io->uv_stride);
+      const int u_lines_in = WebPRescalerImport(
+          p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
+          io->uv_stride);
+      const int v_lines_in = WebPRescalerImport(
+          p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
+          io->uv_stride);
       (void)v_lines_in;   // remove a gcc warning
       assert(u_lines_in == v_lines_in);
       uv_j += u_lines_in;
@@ -403,7 +409,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
 
 static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
   const WEBP_CSP_MODE colorspace = p->output->colorspace;
   const int alpha_first =
       (colorspace == MODE_ARGB || colorspace == MODE_Argb);
@@ -431,7 +437,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
 static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                                int max_lines_out) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
   uint8_t* alpha_dst = base_rgba;
 #else
@@ -470,7 +476,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
     int lines_left = expected_num_out_lines;
     const int y_end = p->last_y + lines_left;
     while (lines_left > 0) {
-      const int row_offset = scaler->src_y - io->mb_y;
+      const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
       WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
                          io->a + row_offset * io->width, io->width);
       lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
@@ -485,51 +491,58 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int out_height = io->scaled_height;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
-  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  // scratch memory for one rescaler
+  const size_t work_size = 2 * (size_t)out_width;
   rescaler_t* work;  // rescalers work area
   uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
-  size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+  uint64_t tmp_size1, tmp_size2, total_size;
+  size_t rescaler_size;
   WebPRescaler* scalers;
   const int num_rescalers = has_alpha ? 4 : 3;
 
-  tmp_size1 = 3 * work_size;
-  tmp_size2 = 3 * out_width;
-  if (has_alpha) {
-    tmp_size1 += work_size;
-    tmp_size2 += out_width;
-  }
+  tmp_size1 = (uint64_t)num_rescalers * work_size;
+  tmp_size2 = (uint64_t)num_rescalers * out_width;
   total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
   rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+  total_size += rescaler_size;
+  if (!CheckSizeOverflow(total_size)) {
+    return 0;
+  }
 
-  p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
+  p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
   tmp = (uint8_t*)(work + tmp_size1);
 
-  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+  scalers = (WebPRescaler*)WEBP_ALIGN(
+      (const uint8_t*)work + total_size - rescaler_size);
   p->scaler_y = &scalers[0];
   p->scaler_u = &scalers[1];
   p->scaler_v = &scalers[2];
   p->scaler_a = has_alpha ? &scalers[3] : NULL;
 
-  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
-                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   work + 0 * work_size);
-  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
-                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   work + 1 * work_size);
-  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
-                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   work + 2 * work_size);
+  if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+                        tmp + 0 * out_width, out_width, out_height, 0, 1,
+                        work + 0 * work_size) ||
+      !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+                        tmp + 1 * out_width, out_width, out_height, 0, 1,
+                        work + 1 * work_size) ||
+      !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+                        tmp + 2 * out_width, out_width, out_height, 0, 1,
+                        work + 2 * work_size)) {
+    return 0;
+  }
   p->emit = EmitRescaledRGB;
   WebPInitYUV444Converters();
 
   if (has_alpha) {
-    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
-                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     work + 3 * work_size);
+    if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+                          tmp + 3 * out_width, out_width, out_height, 0, 1,
+                          work + 3 * work_size)) {
+      return 0;
+    }
     p->emit_alpha = EmitRescaledAlphaRGB;
     if (p->output->colorspace == MODE_RGBA_4444 ||
         p->output->colorspace == MODE_rgbA_4444) {
diff --git a/src/3rdparty/libwebp/src/dec/vp8_dec.c b/src/3rdparty/libwebp/src/dec/vp8_dec.c
index 57efb69..2003935 100644
--- a/src/3rdparty/libwebp/src/dec/vp8_dec.c
+++ b/src/3rdparty/libwebp/src/dec/vp8_dec.c
@@ -335,7 +335,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     io->scaled_width = io->width;
     io->scaled_height = io->height;
 
-    io->mb_w = io->width;   // sanity check
+    io->mb_w = io->width;   // for soundness
     io->mb_h = io->height;  // ditto
 
     VP8ResetProba(&dec->proba_);
@@ -403,7 +403,7 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
   int v;
   if (!VP8GetBit(br, p[3], "coeffs")) {
@@ -494,13 +494,11 @@ static int GetCoeffsAlt(VP8BitReader* const br,
   return 16;
 }
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
-  if (GetCoeffs == NULL) {
-    if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
-      GetCoeffs = GetCoeffsAlt;
-    } else {
-      GetCoeffs = GetCoeffsFast;
-    }
+WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
+  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+    GetCoeffs = GetCoeffsAlt;
+  } else {
+    GetCoeffs = GetCoeffsFast;
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/dec/vp8i_dec.h b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
index 3de8d86..83791ec 100644
--- a/src/3rdparty/libwebp/src/dec/vp8i_dec.h
+++ b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
@@ -31,8 +31,8 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 0
-#define DEC_REV_VERSION 3
+#define DEC_MIN_VERSION 3
+#define DEC_REV_VERSION 0
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
diff --git a/src/3rdparty/libwebp/src/dec/vp8l_dec.c b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
index d3e2711..c0ea018 100644
--- a/src/3rdparty/libwebp/src/dec/vp8l_dec.c
+++ b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
@@ -84,7 +84,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // to 256 (green component values) + 24 (length prefix values)
 // + color_cache_size (between 0 and 2048).
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
 static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
@@ -178,7 +178,7 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
 
 //------------------------------------------------------------------------------
 // Decodes the next Huffman code from bit-stream.
-// FillBitWindow(br) needs to be called at minimum every second call
+// VP8LFillBitWindow(br) needs to be called at minimum every second call
 // to ReadSymbol, in order to pre-fetch enough bits.
 static WEBP_INLINE int ReadSymbol(const HuffmanCode* table,
                                   VP8LBitReader* const br) {
@@ -321,7 +321,7 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
     // The first code is either 1 bit or 8 bit code.
     int symbol = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
     code_lengths[symbol] = 1;
-    // The second code (if present), is always 8 bit long.
+    // The second code (if present), is always 8 bits long.
     if (num_symbols == 2) {
       symbol = VP8LReadBits(br, 8);
       code_lengths[symbol] = 1;
@@ -559,8 +559,11 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   memory += work_size * sizeof(*work);
   scaled_data = (uint32_t*)memory;
 
-  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
-                   out_width, out_height, 0, num_channels, work);
+  if (!WebPRescalerInit(dec->rescaler, in_width, in_height,
+                        (uint8_t*)scaled_data, out_width, out_height,
+                        0, num_channels, work)) {
+    return 0;
+  }
   return 1;
 }
 #endif   // WEBP_REDUCE_SIZE
@@ -574,13 +577,14 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
   uint32_t* const src = (uint32_t*)rescaler->dst;
+  uint8_t* dst = rgba;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
-    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
     WebPRescalerExportRow(rescaler);
     WebPMultARGBRow(src, dst_width, 1);
     VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+    dst += rgba_stride;
     ++num_lines_out;
   }
   return num_lines_out;
@@ -594,8 +598,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
   int num_lines_in = 0;
   int num_lines_out = 0;
   while (num_lines_in < mb_h) {
-    uint8_t* const row_in = in + num_lines_in * in_stride;
-    uint8_t* const row_out = out + num_lines_out * out_stride;
+    uint8_t* const row_in = in + (uint64_t)num_lines_in * in_stride;
+    uint8_t* const row_out = out + (uint64_t)num_lines_out * out_stride;
     const int lines_left = mb_h - num_lines_in;
     const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
     int lines_imported;
@@ -754,11 +758,11 @@ static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
 
 typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
 
-static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+static void ApplyInverseTransforms(VP8LDecoder* const dec,
+                                   int start_row, int num_rows,
                                    const uint32_t* const rows) {
   int n = dec->next_transform_;
   const int cache_pixs = dec->width_ * num_rows;
-  const int start_row = dec->last_row_;
   const int end_row = start_row + num_rows;
   const uint32_t* rows_in = rows;
   uint32_t* const rows_out = dec->argb_cache_;
@@ -789,15 +793,15 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
     VP8Io* const io = dec->io_;
     uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
     const int in_stride = io->width * sizeof(uint32_t);  // in unit of RGBA
-
-    ApplyInverseTransforms(dec, num_rows, rows);
+    ApplyInverseTransforms(dec, dec->last_row_, num_rows, rows);
     if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
       // Nothing to output (this time).
     } else {
       const WebPDecBuffer* const output = dec->output_;
       if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
-        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+        uint8_t* const rgba =
+            buf->rgba + (int64_t)dec->last_out_row_ * buf->stride;
         const int num_rows_out =
 #if !defined(WEBP_REDUCE_SIZE)
          io->use_scaling ?
@@ -948,7 +952,6 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
         break;
       default:
         goto Copy;
-        break;
     }
     CopySmallPattern8b(src, dst, length, pattern);
     return;
@@ -1193,6 +1196,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       VP8LFillBitWindow(br);
       dist_code = GetCopyDistance(dist_symbol, br);
       dist = PlaneCodeToDistance(width, dist_code);
+
       if (VP8LIsEndOfStream(br)) break;
       if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
         goto Error;
@@ -1277,7 +1281,7 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
     uint8_t* const new_data = (uint8_t*)new_color_map;
     new_color_map[0] = transform->data_[0];
     for (i = 4; i < 4 * num_colors; ++i) {
-      // Equivalent to AddPixelEq(), on a byte-basis.
+      // Equivalent to VP8LAddPixels(), on a byte-basis.
       new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
     }
     for (; i < 4 * final_num_colors; ++i) {
@@ -1332,7 +1336,7 @@ static int ReadTransform(int* const xsize, int const* ysize,
        ok = ok && ExpandColorMap(num_colors, transform);
       break;
     }
-    case SUBTRACT_GREEN:
+    case SUBTRACT_GREEN_TRANSFORM:
       break;
     default:
       assert(0);    // can't happen
@@ -1515,7 +1519,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
   assert(dec->width_ <= final_width);
   dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
   if (dec->pixels_ == NULL) {
-    dec->argb_cache_ = NULL;    // for sanity check
+    dec->argb_cache_ = NULL;    // for soundness
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
   }
@@ -1525,7 +1529,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
 
 static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
   const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
-  dec->argb_cache_ = NULL;    // for sanity check
+  dec->argb_cache_ = NULL;    // for soundness
   dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
   if (dec->pixels_ == NULL) {
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
@@ -1553,7 +1557,7 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int last_row) {
     const int cache_pixs = width * num_rows_to_process;
     uint8_t* const dst = output + width * cur_row;
     const uint32_t* const src = dec->argb_cache_;
-    ApplyInverseTransforms(dec, num_rows_to_process, in);
+    ApplyInverseTransforms(dec, cur_row, num_rows_to_process, in);
     WebPExtractGreen(src, dst, cache_pixs);
     AlphaApplyFilter(alph_dec,
                      cur_row, cur_row + num_rows_to_process, dst, width);
@@ -1667,7 +1671,6 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
   VP8Io* io = NULL;
   WebPDecParams* params = NULL;
 
-  // Sanity checks.
   if (dec == NULL) return 0;
 
   assert(dec->hdr_.huffman_tables_ != NULL);
diff --git a/src/3rdparty/libwebp/src/dec/vp8li_dec.h b/src/3rdparty/libwebp/src/dec/vp8li_dec.h
index 0a4d613..72b2e86 100644
--- a/src/3rdparty/libwebp/src/dec/vp8li_dec.h
+++ b/src/3rdparty/libwebp/src/dec/vp8li_dec.h
@@ -37,7 +37,7 @@ struct VP8LTransform {
   int                    bits_;   // subsampling bits defining transform window.
   int                    xsize_;  // transform window X index.
   int                    ysize_;  // transform window Y index.
-  uint32_t              *data_;   // transform data.
+  uint32_t*              data_;   // transform data.
 };
 
 typedef struct {
@@ -48,23 +48,23 @@ typedef struct {
   int             huffman_mask_;
   int             huffman_subsample_bits_;
   int             huffman_xsize_;
-  uint32_t       *huffman_image_;
+  uint32_t*       huffman_image_;
   int             num_htree_groups_;
-  HTreeGroup     *htree_groups_;
-  HuffmanCode    *huffman_tables_;
+  HTreeGroup*     htree_groups_;
+  HuffmanCode*    huffman_tables_;
 } VP8LMetadata;
 
 typedef struct VP8LDecoder VP8LDecoder;
 struct VP8LDecoder {
   VP8StatusCode    status_;
   VP8LDecodeState  state_;
-  VP8Io           *io_;
+  VP8Io*           io_;
 
-  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+  const WebPDecBuffer* output_;    // shortcut to io->opaque->output
 
-  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
+  uint32_t*        pixels_;        // Internal data: either uint8_t* for alpha
                                    // or uint32_t* for BGRA.
-  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+  uint32_t*        argb_cache_;    // Scratch buffer for temporary BGRA storage.
 
   VP8LBitReader    br_;
   int              incremental_;   // if true, incremental decoding is expected
@@ -86,8 +86,8 @@ struct VP8LDecoder {
   // or'd bitset storing the transforms types.
   uint32_t         transforms_seen_;
 
-  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
-  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+  uint8_t*         rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler*    rescaler;         // Common rescaler for all channels.
 };
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dec/webp_dec.c b/src/3rdparty/libwebp/src/dec/webp_dec.c
index 42d0988..3f4f7bb 100644
--- a/src/3rdparty/libwebp/src/dec/webp_dec.c
+++ b/src/3rdparty/libwebp/src/dec/webp_dec.c
@@ -179,7 +179,7 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
       return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
     }
     // For odd-sized chunk-payload, there's one byte padding at the end.
-    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
+    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1u;
     total_size += disk_chunk_size;
 
     // Check that total bytes skipped so far does not exceed riff_size.
@@ -785,6 +785,13 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
 //------------------------------------------------------------------------------
 // Cropping and rescaling.
 
+int WebPCheckCropDimensions(int image_width, int image_height,
+                            int x, int y, int w, int h) {
+  return !(x < 0 || y < 0 || w <= 0 || h <= 0 ||
+           x >= image_width || w > image_width || w > image_width - x ||
+           y >= image_height || h > image_height || h > image_height - y);
+}
+
 int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
                           VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
   const int W = io->width;
@@ -792,7 +799,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
   int x = 0, y = 0, w = W, h = H;
 
   // Cropping
-  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  io->use_cropping = (options != NULL) && options->use_cropping;
   if (io->use_cropping) {
     w = options->crop_width;
     h = options->crop_height;
@@ -802,7 +809,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
       x &= ~1;
       y &= ~1;
     }
-    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+    if (!WebPCheckCropDimensions(W, H, x, y, w, h)) {
       return 0;  // out of frame boundary error
     }
   }
@@ -814,7 +821,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
   io->mb_h = h;
 
   // Scaling
-  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  io->use_scaling = (options != NULL) && options->use_scaling;
   if (io->use_scaling) {
     int scaled_width = options->scaled_width;
     int scaled_height = options->scaled_height;
@@ -835,8 +842,8 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 
   if (io->use_scaling) {
     // disable filter (only for large downscaling ratio).
-    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
-                           (io->scaled_height < H * 3 / 4);
+    io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+                            (io->scaled_height < H * 3 / 4);
     io->fancy_upsampling = 0;
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/dec/webpi_dec.h b/src/3rdparty/libwebp/src/dec/webpi_dec.h
index 24baff5..3b97388 100644
--- a/src/3rdparty/libwebp/src/dec/webpi_dec.h
+++ b/src/3rdparty/libwebp/src/dec/webpi_dec.h
@@ -77,6 +77,10 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
 //------------------------------------------------------------------------------
 // Misc utils
 
+// Returns true if crop dimensions are within image bounds.
+int WebPCheckCropDimensions(int image_width, int image_height,
+                            int x, int y, int w, int h);
+
 // Initializes VP8Io with custom setup, io and teardown functions. The default
 // hooks will use the supplied 'params' as io->opaque handle.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
diff --git a/src/3rdparty/libwebp/src/demux/anim_decode.c b/src/3rdparty/libwebp/src/demux/anim_decode.c
index 05dd707..e077ffb 100644
--- a/src/3rdparty/libwebp/src/demux/anim_decode.c
+++ b/src/3rdparty/libwebp/src/demux/anim_decode.c
@@ -23,6 +23,14 @@
 
 #define NUM_CHANNELS 4
 
+// Channel extraction from a uint32_t representation of a uint8_t RGBA/BGRA
+// buffer.
+#ifdef WORDS_BIGENDIAN
+#define CHANNEL_SHIFT(i) (24 - (i) * 8)
+#else
+#define CHANNEL_SHIFT(i) ((i) * 8)
+#endif
+
 typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
 static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels);
@@ -87,11 +95,19 @@ WebPAnimDecoder* WebPAnimDecoderNewInternal(
     int abi_version) {
   WebPAnimDecoderOptions options;
   WebPAnimDecoder* dec = NULL;
+  WebPBitstreamFeatures features;
   if (webp_data == NULL ||
       WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
     return NULL;
   }
 
+  // Validate the bitstream before doing expensive allocations. The demuxer may
+  // be more tolerant than the decoder.
+  if (WebPGetFeatures(webp_data->bytes, webp_data->size, &features) !=
+      VP8_STATUS_OK) {
+    return NULL;
+  }
+
   // Note: calloc() so that the pointer members are initialized to NULL.
   dec = (WebPAnimDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
   if (dec == NULL) goto Error;
@@ -145,7 +161,7 @@ static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
                           uint32_t canvas_height) {
   const uint64_t size =
       (uint64_t)canvas_width * canvas_height * NUM_CHANNELS * sizeof(*buf);
-  if (size != (size_t)size) return 0;
+  if (!CheckSizeOverflow(size)) return 0;
   memset(buf, 0, (size_t)size);
   return 1;
 }
@@ -166,7 +182,7 @@ static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
 static int CopyCanvas(const uint8_t* src, uint8_t* dst,
                       uint32_t width, uint32_t height) {
   const uint64_t size = (uint64_t)width * height * NUM_CHANNELS;
-  if (size != (size_t)size) return 0;
+  if (!CheckSizeOverflow(size)) return 0;
   assert(src != NULL && dst != NULL);
   memcpy(dst, src, (size_t)size);
   return 1;
@@ -201,35 +217,35 @@ static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
   const uint8_t dst_channel = (dst >> shift) & 0xff;
   const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
   assert(blend_unscaled < (1ULL << 32) / scale);
-  return (blend_unscaled * scale) >> 24;
+  return (blend_unscaled * scale) >> CHANNEL_SHIFT(3);
 }
 
 // Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
 static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
 
   if (src_a == 0) {
     return dst;
   } else {
-    const uint8_t dst_a = (dst >> 24) & 0xff;
+    const uint8_t dst_a = (dst >> CHANNEL_SHIFT(3)) & 0xff;
     // This is the approximate integer arithmetic for the actual formula:
     // dst_factor_a = (dst_a * (255 - src_a)) / 255.
     const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
     const uint8_t blend_a = src_a + dst_factor_a;
     const uint32_t scale = (1UL << 24) / blend_a;
 
-    const uint8_t blend_r =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
-    const uint8_t blend_g =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
-    const uint8_t blend_b =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    const uint8_t blend_r = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(0));
+    const uint8_t blend_g = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(1));
+    const uint8_t blend_b = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(2));
     assert(src_a + dst_factor_a < 256);
 
-    return (blend_r << 0) |
-           (blend_g << 8) |
-           (blend_b << 16) |
-           ((uint32_t)blend_a << 24);
+    return ((uint32_t)blend_r << CHANNEL_SHIFT(0)) |
+           ((uint32_t)blend_g << CHANNEL_SHIFT(1)) |
+           ((uint32_t)blend_b << CHANNEL_SHIFT(2)) |
+           ((uint32_t)blend_a << CHANNEL_SHIFT(3));
   }
 }
 
@@ -239,7 +255,7 @@ static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelNonPremult(src[i], dst[i]);
     }
@@ -256,7 +272,7 @@ static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
 
 // Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
 static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
   return src + ChannelwiseMultiply(dst, 256 - src_a);
 }
 
@@ -266,7 +282,7 @@ static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
                                  int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelPremult(src[i], dst[i]);
     }
@@ -346,12 +362,15 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
   {
     const uint8_t* in = iter.fragment.bytes;
     const size_t in_size = iter.fragment.size;
-    const size_t out_offset =
-        (iter.y_offset * width + iter.x_offset) * NUM_CHANNELS;
+    const uint32_t stride = width * NUM_CHANNELS;  // at most 25 + 2 bits
+    const uint64_t out_offset = (uint64_t)iter.y_offset * stride +
+                                (uint64_t)iter.x_offset * NUM_CHANNELS;  // 53b
+    const uint64_t size = (uint64_t)iter.height * stride;  // at most 25 + 27b
     WebPDecoderConfig* const config = &dec->config_;
     WebPRGBABuffer* const buf = &config->output.u.RGBA;
-    buf->stride = NUM_CHANNELS * width;
-    buf->size = buf->stride * iter.height;
+    if ((size_t)size != size) goto Error;
+    buf->stride = (int)stride;
+    buf->size = (size_t)size;
     buf->rgba = dec->curr_frame_ + out_offset;
 
     if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) {
diff --git a/src/3rdparty/libwebp/src/demux/demux.c b/src/3rdparty/libwebp/src/demux/demux.c
index ab6433e..324e5eb 100644
--- a/src/3rdparty/libwebp/src/demux/demux.c
+++ b/src/3rdparty/libwebp/src/demux/demux.c
@@ -24,8 +24,8 @@
 #include "src/webp/format_constants.h"
 
 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 0
-#define DMUX_REV_VERSION 3
+#define DMUX_MIN_VERSION 3
+#define DMUX_REV_VERSION 0
 
 typedef struct {
   size_t start_;        // start location of the data
@@ -221,12 +221,16 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
     const size_t chunk_start_offset = mem->start_;
     const uint32_t fourcc = ReadLE32(mem);
     const uint32_t payload_size = ReadLE32(mem);
-    const uint32_t payload_size_padded = payload_size + (payload_size & 1);
-    const size_t payload_available = (payload_size_padded > MemDataSize(mem))
-                                   ? MemDataSize(mem) : payload_size_padded;
-    const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+    uint32_t payload_size_padded;
+    size_t payload_available;
+    size_t chunk_size;
 
     if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+    payload_size_padded = payload_size + (payload_size & 1);
+    payload_available = (payload_size_padded > MemDataSize(mem))
+                      ? MemDataSize(mem) : payload_size_padded;
+    chunk_size = CHUNK_HEADER_SIZE + payload_available;
     if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
     if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
 
@@ -312,6 +316,7 @@ static ParseStatus ParseAnimationFrame(
   int bits;
   MemBuffer* const mem = &dmux->mem_;
   Frame* frame;
+  size_t start_offset;
   ParseStatus status =
       NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
   if (status != PARSE_OK) return status;
@@ -332,7 +337,11 @@ static ParseStatus ParseAnimationFrame(
 
   // Store a frame only if the animation flag is set there is some data for
   // this frame is available.
+  start_offset = mem->start_;
   status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
+  if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) {
+    status = PARSE_ERROR;
+  }
   if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
     added_frame = AddFrame(dmux, frame);
     if (added_frame) {
@@ -446,9 +455,11 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
     const size_t chunk_start_offset = mem->start_;
     const uint32_t fourcc = ReadLE32(mem);
     const uint32_t chunk_size = ReadLE32(mem);
-    const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+    uint32_t chunk_size_padded;
 
     if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+    chunk_size_padded = chunk_size + (chunk_size & 1);
     if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
 
     switch (fourcc) {
@@ -603,7 +614,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
 
   while (f != NULL) {
     const int cur_frame_set = f->frame_num_;
-    int frame_count = 0;
 
     // Check frame properties.
     for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
@@ -638,8 +648,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
                             dmux->canvas_width_, dmux->canvas_height_)) {
         return 0;
       }
-
-      ++frame_count;
     }
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 819d139..1892929 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+                   const uint8_t* WEBP_RESTRICT const alpha,
                    int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
 #undef MFIX
 
 void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
-void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+                    const uint8_t* WEBP_RESTRICT const alpha,
                     int width, int inverse);
 
 //------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
   }
 }
 
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+                  const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                   int width, int num_rows, int inverse) {
   int n;
   for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
 }
 
 #if !WEBP_NEON_OMIT_C_CODE
-static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                            int width, int height,
-                           uint8_t* dst, int dst_stride) {
+                           uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   uint32_t alpha_mask = 0xff;
   int i, j;
 
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xff);
 }
 
-static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
-                                   int width, int height,
-                                   uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
+                                   int alpha_stride, int width, int height,
+                                   uint32_t* WEBP_RESTRICT dst,
+                                   int dst_stride) {
   int i, j;
   for (j = 0; j < height; ++j) {
     for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                           int width, int height,
-                          uint8_t* alpha, int alpha_stride) {
+                          uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   uint8_t alpha_mask = 0xff;
   int i, j;
 
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
   return (alpha_mask == 0xff);
 }
 
-static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT alpha, int size) {
   int i;
   for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
@@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
   return 0;
 }
 
+static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
+  int x;
+  for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
+}
+
 //------------------------------------------------------------------------------
 // Simple channel manipulations.
 
@@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
 }
 
 #ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                       const uint8_t* b, int len, uint32_t* out) {
+static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT r,
+                       const uint8_t* WEBP_RESTRICT g,
+                       const uint8_t* WEBP_RESTRICT b,
+                       int len, uint32_t* WEBP_RESTRICT out) {
   int i;
   for (i = 0; i < len; ++i) {
     out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
 }
 #endif
 
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                      int len, int step, uint32_t* out) {
+static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
+                      const uint8_t* WEBP_RESTRICT g,
+                      const uint8_t* WEBP_RESTRICT b,
+                      int len, int step, uint32_t* WEBP_RESTRICT out) {
   int i, offset = 0;
   for (i = 0; i < len; ++i) {
     out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
-int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
-int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                         uint8_t* WEBP_RESTRICT, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                                 uint32_t* WEBP_RESTRICT, int);
+int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                        uint8_t* WEBP_RESTRICT, int);
+void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+                         uint8_t* WEBP_RESTRICT alpha, int size);
 #ifdef WORDS_BIGENDIAN
 void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                      const uint8_t* b, int, uint32_t*);
 #endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out);
+void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+                    const uint8_t* WEBP_RESTRICT g,
+                    const uint8_t* WEBP_RESTRICT b,
+                    int len, int step, uint32_t* WEBP_RESTRICT out);
 
 int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
 
 //------------------------------------------------------------------------------
 // Init function
@@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
 
   WebPHasAlpha8b = HasAlpha8b_C;
   WebPHasAlpha32b = HasAlpha32b_C;
+  WebPAlphaReplace = AlphaReplace_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitAlphaProcessingSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         WebPInitAlphaProcessingSSE41();
       }
@@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitAlphaProcessingNEON();
@@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
   assert(WebPPackRGB != NULL);
   assert(WebPHasAlpha8b != NULL);
   assert(WebPHasAlpha32b != NULL);
+  assert(WebPAlphaReplace != NULL);
 }
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
index 9d55421..6716fb7 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -80,10 +80,10 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
-  uint32_t alpha_mask = 0xffffffffu;
+static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
+                              int alpha_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
+  uint32_t alpha_mask = 0xffu;
   uint8x8_t mask8 = vdup_n_u8(0xff);
   uint32_t tmp[2];
   int i, j;
@@ -107,14 +107,16 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
     dst += dst_stride;
   }
   vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101;
   alpha_mask &= tmp[0];
   alpha_mask &= tmp[1];
   return (alpha_mask != 0xffffffffu);
 }
 
-static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
+                                      int alpha_stride, int width, int height,
+                                      uint32_t* WEBP_RESTRICT dst,
+                                      int dst_stride) {
   int i, j;
   uint8x8x4_t greens;   // leave A/R/B channels zero'd.
   greens.val[0] = vdup_n_u8(0);
@@ -131,10 +133,10 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                              int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
-  uint32_t alpha_mask = 0xffffffffu;
+                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
+  uint32_t alpha_mask = 0xffu;
   uint8x8_t mask8 = vdup_n_u8(0xff);
   uint32_t tmp[2];
   int i, j;
@@ -156,13 +158,14 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
     alpha += alpha_stride;
   }
   vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101;
   alpha_mask &= tmp[0];
   alpha_mask &= tmp[1];
   return (alpha_mask == 0xffffffffu);
 }
 
-static void ExtractGreen_NEON(const uint32_t* argb,
-                              uint8_t* alpha, int size) {
+static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
+                              uint8_t* WEBP_RESTRICT alpha, int size) {
   int i;
   for (i = 0; i + 16 <= size; i += 16) {
     const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 2871c56..f0843d0 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -18,16 +18,16 @@
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+                              int alpha_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
   int i, j;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
-  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00);  // to preserve RGB
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
   return (alpha_and != 0xff);
 }
 
-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+                                      int alpha_stride, int width, int height,
+                                      uint32_t* WEBP_RESTRICT dst,
+                                      int dst_stride) {
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const int limit = width & ~15;
@@ -98,15 +99,15 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                              int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
+                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
   int i, j;
-  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
-  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  const __m128i a_mask = _mm_set1_epi32(0xff);  // to preserve alpha
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
@@ -177,7 +178,7 @@ static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
 static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                     int w, int h, int stride) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i kMult = _mm_set1_epi16(0x8081u);
+  const __m128i kMult = _mm_set1_epi16((short)0x8081);
   const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
   const int kSpan = 4;
   while (h-- > 0) {
@@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
   return 0;
 }
 
+static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
+  const __m128i m_color = _mm_set1_epi32((int)color);
+  const __m128i zero = _mm_setzero_si128();
+  int i = 0;
+  for (; i + 8 <= length; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
+    const __m128i b0 = _mm_srai_epi32(a0, 24);
+    const __m128i b1 = _mm_srai_epi32(a1, 24);
+    const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
+    const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
+    const __m128i d0 = _mm_and_si128(c0, m_color);
+    const __m128i d1 = _mm_and_si128(c1, m_color);
+    const __m128i e0 = _mm_andnot_si128(c0, a0);
+    const __m128i e1 = _mm_andnot_si128(c1, a1);
+    _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
+    _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
+  }
+  for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows
 
@@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
   if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }
 
-static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
+                         const uint8_t* WEBP_RESTRICT const alpha,
                          int width, int inverse) {
   int x = 0;
   if (!inverse) {
@@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
 
   WebPHasAlpha8b = HasAlpha8b_SSE2;
   WebPHasAlpha32b = HasAlpha32b_SSE2;
+  WebPAlphaReplace = AlphaReplace_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
index 56040f9..1156ac3 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -19,14 +19,14 @@
 
 //------------------------------------------------------------------------------
 
-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
-                              int width, int height,
-                              uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
+                              int argb_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
   int i, j;
-  const __m128i all_0xff = _mm_set1_epi32(~0u);
+  const __m128i all_0xff = _mm_set1_epi32(~0);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
index cc681cd..460ec4f 100644
--- a/src/3rdparty/libwebp/src/dsp/cost.c
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -395,12 +395,12 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
       VP8EncDspCostInitMIPSdspR2();
     }
 #endif
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8EncDspCostInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
     if (VP8GetCPUInfo(kNEON)) {
       VP8EncDspCostInitNEON();
     }
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index 0fa5b6a..62de73f 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#include "src/dsp/dsp.h"
+#include "src/dsp/cpu.h"
 
 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
@@ -55,12 +55,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
     : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
     : "a"(info_type), "c"(0));
 }
-#elif (defined(_M_X64) || defined(_M_IX86)) && \
-      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
 #include <intrin.h>
 #define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
-#elif defined(WEBP_MSC_SSE2)
+#define WEBP_HAVE_MSC_CPUID
+#elif _MSC_VER > 1310
+#include <intrin.h>
 #define GetCPUInfo __cpuid
+#define WEBP_HAVE_MSC_CPUID
+#endif
+
 #endif
 
 // NaCl has no support for xgetbv or the raw opcode.
@@ -94,7 +100,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif
 
-#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
 
 // helper function for run-time detection of slow SSSE3 platforms
 static int CheckSlowModel(int info) {
@@ -179,9 +185,34 @@ static int AndroidCPUInfo(CPUFeature feature) {
   return 0;
 }
 VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
-#elif defined(WEBP_USE_NEON)
-// define a dummy function to enable turning off NEON at runtime by setting
-// VP8DecGetCPUInfo = NULL
+#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
+// Use compile flags as an indicator of SIMD support instead of a runtime check.
+static int wasmCPUInfo(CPUFeature feature) {
+  switch (feature) {
+#ifdef WEBP_HAVE_SSE2
+    case kSSE2:
+      return 1;
+#endif
+#ifdef WEBP_HAVE_SSE41
+    case kSSE3:
+    case kSlowSSSE3:
+    case kSSE4_1:
+      return 1;
+#endif
+#ifdef WEBP_HAVE_NEON
+    case kNEON:
+      return 1;
+#endif
+    default:
+      break;
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
+#elif defined(WEBP_HAVE_NEON)
+// In most cases this function doesn't check for NEON support (it's assumed by
+// the configuration), but enables turning off NEON at runtime, for testing
+// purposes, by setting VP8GetCPUInfo = NULL.
 static int armCPUInfo(CPUFeature feature) {
   if (feature != kNEON) return 0;
 #if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.h b/src/3rdparty/libwebp/src/dsp/cpu.h
new file mode 100644
index 0000000..de32a39
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/cpu.h
@@ -0,0 +1,259 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   CPU detection functions and macros.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_CPU_H_
+#define WEBP_DSP_CPU_H_
+
+#include <stddef.h>
+
+#include <qglobal.h>
+
+#ifdef HAVE_CONFIG_H
+#include "src/webp/config.h"
+#endif
+
+#include "src/webp/types.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#if defined(__clang__)
+#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+#define LOCAL_CLANG_PREREQ(maj, min) \
+  (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_CLANG_VERSION 0
+#define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if !defined(HAVE_CONFIG_H)
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__)
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+    (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__)
+#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+#endif
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
+#define WEBP_USE_SSE41
+#endif
+
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) &&       \
+     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
+    !defined(__native_client__)
+#define WEBP_USE_NEON
+#endif
+
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
+// vtbl4_u8(); a fix was made in 16.6.
+#if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+                          (_MSC_VER >= 1926 && defined(_M_ARM64))) && \
+                         !defined(__clang__) && (QT_CONFIG_neon == 1)
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
+#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \
+    (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
+#define WEBP_USE_MSA
+#endif
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \
+      defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define WEBP_MSAN
+#endif
+#endif
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define WEBP_DSP_INIT(func)                                         \
+  do {                                                              \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used =           \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;                      \
+    static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \
+    if (pthread_mutex_lock(&func##_lock)) break;                    \
+    if (func##_last_cpuinfo_used != VP8GetCPUInfo) func();          \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;                       \
+    (void)pthread_mutex_unlock(&func##_lock);                       \
+  } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define WEBP_DSP_INIT(func)                               \
+  do {                                                    \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used = \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;            \
+    if (func##_last_cpuinfo_used == VP8GetCPUInfo) break; \
+    func();                                               \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;             \
+  } while (0)
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Defines an Init + helper function that control multiple initialization of
+// function pointers / tables.
+/* Usage:
+   WEBP_DSP_INIT_FUNC(InitFunc) {
+     ...function body
+   }
+*/
+#define WEBP_DSP_INIT_FUNC(name)                                            \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void);                  \
+  WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); } \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void)
+
+#define WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures. This is only meant to silence unaligned loads on platforms that
+// are known to support them.
+#undef WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined")))
+
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures related to unsigned integer overflows. This is only meant to
+// silence cases where this well defined behavior is expected.
+#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
+
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) &&                   \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kSlowSSSE3,  // special feature for slow SSSE3 architectures
+  kSSE4_1,
+  kAVX,
+  kAVX2,
+  kNEON,
+  kMIPS32,
+  kMIPSdspR2,
+  kMSA
+} CPUFeature;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_CPU_H_
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index 1119842..537c701 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8DspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8DspInitSSE41();
       }
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8DspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index ffa697f..fa85170 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) {    // DC
   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   const uint16x4_t p1 = vpadd_u16(p0, p0);
-  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
-  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
-  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
-  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
-  const uint16x8_t s0 = vaddq_u16(L0, L1);
-  const uint16x8_t s1 = vaddq_u16(L2, L3);
+  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+  const uint16x8_t s0 = vaddl_u8(L0, L1);
+  const uint16x8_t s1 = vaddl_u8(L2, L3);
   const uint16x8_t s01 = vaddq_u16(s0, s1);
   const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
@@ -1361,7 +1361,8 @@ static void RD4_NEON(uint8_t* dst) {   // Down-right
   const uint32_t J = dst[-1 + 1 * BPS];
   const uint32_t K = dst[-1 + 2 * BPS];
   const uint32_t L = dst[-1 + 3 * BPS];
-  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+  const uint64x1_t LKJI____ =
+      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
@@ -1427,25 +1428,30 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p2 = vaddlv_u8(A);
+    sum_top = vdupq_n_u16(p2);
+#else
     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
     const uint16x4_t p1 = vpadd_u16(p0, p0);
     const uint16x4_t p2 = vpadd_u16(p1, p1);
     sum_top = vcombine_u16(p2, p2);
+#endif
   }
 
   if (do_left) {
-    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
-    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
-    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
-    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
-    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
-    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
-    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
-    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
-    const uint16x8_t s0 = vaddq_u16(L0, L1);
-    const uint16x8_t s1 = vaddq_u16(L2, L3);
-    const uint16x8_t s2 = vaddq_u16(L4, L5);
-    const uint16x8_t s3 = vaddq_u16(L6, L7);
+    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
+    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
+    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
+    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
+    const uint16x8_t s0 = vaddl_u8(L0, L1);
+    const uint16x8_t s1 = vaddl_u8(L2, L3);
+    const uint16x8_t s2 = vaddl_u8(L4, L5);
+    const uint16x8_t s3 = vaddl_u8(L6, L7);
     const uint16x8_t s01 = vaddq_u16(s0, s1);
     const uint16x8_t s23 = vaddq_u16(s2, s3);
     sum_left = vaddq_u16(s01, s23);
@@ -1505,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p3 = vaddlvq_u8(A);
+    sum_top = vdupq_n_u16(p3);
+#else
     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
     const uint16x4_t p2 = vpadd_u16(p1, p1);
     const uint16x4_t p3 = vpadd_u16(p2, p2);
     sum_top = vcombine_u16(p3, p3);
+#endif
   }
 
   if (do_left) {
     int i;
     sum_left = vdupq_n_u16(0);
     for (i = 0; i < 16; i += 8) {
-      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
-      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
-      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
-      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
-      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
-      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
-      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
-      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
-      const uint16x8_t s0 = vaddq_u16(L0, L1);
-      const uint16x8_t s1 = vaddq_u16(L2, L3);
-      const uint16x8_t s2 = vaddq_u16(L4, L5);
-      const uint16x8_t s3 = vaddq_u16(L6, L7);
+      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
+      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
+      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
+      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
+      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
+      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
+      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
+      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
+      const uint16x8_t s0 = vaddl_u8(L0, L1);
+      const uint16x8_t s1 = vaddl_u8(L2, L3);
+      const uint16x8_t s2 = vaddl_u8(L4, L5);
+      const uint16x8_t s3 = vaddl_u8(L6, L7);
       const uint16x8_t s01 = vaddq_u16(s0, s1);
       const uint16x8_t s23 = vaddq_u16(s2, s3);
       const uint16x8_t sum = vaddq_u16(s01, s23);
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
index 873aa59..01e6bcb 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -158,10 +158,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
       dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
     } else {
       // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
+      dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS));
     }
     // Convert to 16b.
     dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -187,10 +187,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
       _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
     } else {
       // Store four bytes/pixels per line.
-      WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
-      WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
-      WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
-      WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
+      WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+      WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+      WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+      WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
     }
   }
 }
@@ -213,10 +213,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const __m128i m3 = _mm_subs_epi16(B, d4);
   const __m128i zero = _mm_setzero_si128();
   // Load the source pixels.
-  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
-  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
-  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
-  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
+  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS));
   // Convert to 16b.
   dst0 = _mm_unpacklo_epi8(dst0, zero);
   dst1 = _mm_unpacklo_epi8(dst1, zero);
@@ -233,10 +233,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   dst2 = _mm_packus_epi16(dst2, dst2);
   dst3 = _mm_packus_epi16(dst3, dst3);
   // Store the results.
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
 }
 #undef MUL
 #endif   // USE_TRANSFORM_AC3
@@ -477,11 +477,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
   // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
   // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
   const __m128i A0 = _mm_set_epi32(
-      WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]),
-      WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride]));
+      WebPMemToInt32(&b[6 * stride]), WebPMemToInt32(&b[2 * stride]),
+      WebPMemToInt32(&b[4 * stride]), WebPMemToInt32(&b[0 * stride]));
   const __m128i A1 = _mm_set_epi32(
-      WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]),
-      WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride]));
+      WebPMemToInt32(&b[7 * stride]), WebPMemToInt32(&b[3 * stride]),
+      WebPMemToInt32(&b[5 * stride]), WebPMemToInt32(&b[1 * stride]));
 
   // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
   // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
@@ -540,7 +540,7 @@ static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
                                       uint8_t* dst, int stride) {
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
+    WebPInt32ToMem(dst, _mm_cvtsi128_si32(*x));
     *x = _mm_srli_si128(*x, 4);
   }
 }
@@ -908,10 +908,10 @@ static void VE4_SSE2(uint8_t* dst) {    // vertical
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
   const __m128i b = _mm_subs_epu8(a, lsb);
   const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
-  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  const int vals = _mm_cvtsi128_si32(avg);
   int i;
   for (i = 0; i < 4; ++i) {
-    WebPUint32ToMem(dst + i * BPS, vals);
+    WebPInt32ToMem(dst + i * BPS, vals);
   }
 }
 
@@ -925,10 +925,10 @@ static void LD4_SSE2(uint8_t* dst) {   // Down-Left
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
@@ -946,10 +946,10 @@ static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
 
   // these two are hard to implement in SSE2, so we keep the C-version:
   DST(0, 2) = AVG3(J, I, X);
@@ -970,11 +970,12 @@ static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
   const __m128i abbc = _mm_or_si128(ab, bc);
   const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
   const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
-  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+  const uint32_t extra_out =
+      (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
 
   // these two are hard to get and irregular
   DST(3, 2) = (extra_out >> 0) & 0xff;
@@ -990,7 +991,7 @@ static void RD4_SSE2(uint8_t* dst) {   // Down-right
   const uint32_t K = dst[-1 + 2 * BPS];
   const uint32_t L = dst[-1 + 3 * BPS];
   const __m128i LKJI_____ =
-      _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
+      _mm_cvtsi32_si128((int)(L | (K << 8) | (J << 16) | (I << 24)));
   const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
   const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
   const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
@@ -998,10 +999,10 @@ static void RD4_SSE2(uint8_t* dst) {   // Down-right
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 #undef DST
@@ -1015,13 +1016,13 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
   const __m128i zero = _mm_setzero_si128();
   int y;
   if (size == 4) {
-    const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+    const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
     const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
     for (y = 0; y < 4; ++y, dst += BPS) {
       const int val = dst[-1] - top[-1];
       const __m128i base = _mm_set1_epi16(val);
       const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
-      WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+      WebPInt32ToMem(dst, _mm_cvtsi128_si32(out));
     }
   } else if (size == 8) {
     const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
@@ -1062,7 +1063,7 @@ static void VE16_SSE2(uint8_t* dst) {
 static void HE16_SSE2(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
-    const __m128i values = _mm_set1_epi8(dst[-1]);
+    const __m128i values = _mm_set1_epi8((char)dst[-1]);
     _mm_storeu_si128((__m128i*)dst, values);
     dst += BPS;
   }
@@ -1070,7 +1071,7 @@ static void HE16_SSE2(uint8_t* dst) {     // horizontal
 
 static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 16; ++j) {
     _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
   }
@@ -1130,7 +1131,7 @@ static void VE8uv_SSE2(uint8_t* dst) {    // vertical
 // helper for chroma-DC predictions
 static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 8; ++j) {
     _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
   }
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
index 8f18506..08a3630 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
@@ -23,7 +23,7 @@ static void HE16_SSE41(uint8_t* dst) {     // horizontal
   int j;
   const __m128i kShuffle3 = _mm_set1_epi8(3);
   for (j = 16; j > 0; --j) {
-    const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4));
+    const __m128i in = _mm_cvtsi32_si128(WebPMemToInt32(dst - 4));
     const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
     _mm_storeu_si128((__m128i*)dst, values);
     dst += BPS;
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 0d7f3fb..d2000b8 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -18,6 +18,7 @@
 #include "src/webp/config.h"
 #endif
 
+#include "src/dsp/cpu.h"
 #include "src/webp/types.h"
 
 #ifdef __cplusplus
@@ -27,199 +28,22 @@ extern "C" {
 #define BPS 32   // this is the common stride for enc/dec
 
 //------------------------------------------------------------------------------
-// CPU detection
-
+// WEBP_RESTRICT
+
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
 #if defined(__GNUC__)
-# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
-# define LOCAL_GCC_PREREQ(maj, min) \
-    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_GCC_VERSION 0
-# define LOCAL_GCC_PREREQ(maj, min) 0
-#endif
-
-#if defined(__clang__)
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#define WEBP_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define WEBP_RESTRICT __restrict
 #else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif
-
-#ifndef __has_builtin
-# define __has_builtin(x) 0
-#endif
-
-// for now, none of the optimizations below are available in emscripten
-#if !defined(EMSCRIPTEN)
-
-#if defined(_MSC_VER) && _MSC_VER > 1310 && \
-    (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__)
-#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
-    (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__)
-#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
-#endif
-
-// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
-// files without intrinsics, allowing the corresponding Init() to be called.
-// Files containing intrinsics will need to be built targeting the instruction
-// set so should succeed on one of the earlier tests.
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
-#define WEBP_USE_SSE2
-#endif
-
-#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
-#define WEBP_USE_SSE41
-#endif
-
-// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
-// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
-     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
-    !defined(__native_client__)
-#define WEBP_USE_NEON
-#endif
-
-#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
-    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
-#define WEBP_ANDROID_NEON  // Android targets that may have NEON
-#define WEBP_USE_NEON
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM) && !defined(__clang__)
-#define WEBP_USE_NEON
-#define WEBP_USE_INTRINSICS
+#define WEBP_RESTRICT
 #endif
 
-#if defined(__mips__) && !defined(__mips64) && \
-    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
-#define WEBP_USE_MIPS32
-#if (__mips_isa_rev >= 2)
-#define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
-#define WEBP_USE_MIPS_DSP_R2
-#endif
-#endif
-#endif
-
-#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
-#define WEBP_USE_MSA
-#endif
-
-#endif  /* EMSCRIPTEN */
-
-#ifndef WEBP_DSP_OMIT_C_CODE
-#define WEBP_DSP_OMIT_C_CODE 1
-#endif
-
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
-#define WEBP_NEON_OMIT_C_CODE 1
-#else
-#define WEBP_NEON_OMIT_C_CODE 0
-#endif
-
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
-#define WEBP_NEON_WORK_AROUND_GCC 1
-#else
-#define WEBP_NEON_WORK_AROUND_GCC 0
-#endif
-
-// This macro prevents thread_sanitizer from reporting known concurrent writes.
-#define WEBP_TSAN_IGNORE_FUNCTION
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#undef WEBP_TSAN_IGNORE_FUNCTION
-#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
-#endif
-#endif
-
-#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
-#include <pthread.h>  // NOLINT
-
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
-  if (pthread_mutex_lock(&func ## _lock)) break;                    \
-  if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func();          \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-  (void)pthread_mutex_unlock(&func ## _lock);                       \
-} while (0)
-#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break;           \
-  func();                                                           \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-} while (0)
-#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
-
-// Defines an Init + helper function that control multiple initialization of
-// function pointers / tables.
-/* Usage:
-   WEBP_DSP_INIT_FUNC(InitFunc) {
-     ...function body
-   }
-*/
-#define WEBP_DSP_INIT_FUNC(name)                             \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
-  WEBP_TSAN_IGNORE_FUNCTION void name(void) {                \
-    WEBP_DSP_INIT(name ## _body);                            \
-  }                                                          \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
-
-#define WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#if defined(__clang__) && defined(__has_attribute)
-#if __has_attribute(no_sanitize)
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures. This is only meant to silence unaligned loads on platforms that
-// are known to support them.
-#undef WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNDEF \
-  __attribute__((no_sanitize("undefined")))
-
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures related to unsigned integer overflows. This is only meant to
-// silence cases where this well defined behavior is expected.
-#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
-  __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
-#endif
-
-// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
-#if !defined(WEBP_SWAP_16BIT_CSP)
-#define WEBP_SWAP_16BIT_CSP 0
-#endif
-
-// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
-#if !defined(WORDS_BIGENDIAN) && \
-    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
-     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
-#define WORDS_BIGENDIAN
-#endif
-
-typedef enum {
-  kSSE2,
-  kSSE3,
-  kSlowSSSE3,  // special feature for slow SSSE3 architectures
-  kSSE4_1,
-  kAVX,
-  kAVX2,
-  kNEON,
-  kMIPS32,
-  kMIPSdspR2,
-  kMSA
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 
 //------------------------------------------------------------------------------
 // Init stub generator
@@ -246,9 +70,9 @@ extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
                               const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
 extern VP8Intra4Preds VP8EncPredLuma4;
 extern VP8IntraPreds VP8EncPredLuma16;
 extern VP8IntraPreds VP8EncPredChroma8;
@@ -508,15 +332,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
                                     uint8_t* u, uint8_t* v, int width);
 
-// utilities for accurate RGB->YUV conversion
-extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
-                                       uint16_t* dst, int len);
-extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
-                                     int16_t* dst, int len);
-extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
-                                     int len,
-                                     const uint16_t* best_y, uint16_t* out);
-
 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);
 
@@ -572,26 +387,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
 
 // Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
 // Returns true if alpha[] plane has non-trivial values different from 0xff.
-extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
-                                int width, int height,
-                                uint8_t* dst, int dst_stride);
+extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
+                                int alpha_stride, int width, int height,
+                                uint8_t* WEBP_RESTRICT dst, int dst_stride);
 
 // Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
 // A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
-extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
-                                        int width, int height,
-                                        uint32_t* dst, int dst_stride);
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
+                                        int alpha_stride, int width, int height,
+                                        uint32_t* WEBP_RESTRICT dst,
+                                        int dst_stride);
 
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
-extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
-                               int width, int height,
-                               uint8_t* alpha, int alpha_stride);
+extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
+                               int argb_stride, int width, int height,
+                               uint8_t* WEBP_RESTRICT alpha,
+                               int alpha_stride);
 
 // Extract the green values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlphaToGreen).
-extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT alpha, int size);
 
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
@@ -604,34 +422,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                       int inverse);
 
 // Same for a row of single values, with side alpha values.
-extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+                           const uint8_t* WEBP_RESTRICT const alpha,
                            int width, int inverse);
 
 // Same a WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+                  const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                   int width, int num_rows, int inverse);
 
 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+                   const uint8_t* WEBP_RESTRICT const alpha,
                    int width, int inverse);
 void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
 
 #ifdef WORDS_BIGENDIAN
 // ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
-                            const uint8_t* g, const uint8_t* b, int len,
-                            uint32_t* out);
+extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT r,
+                            const uint8_t* WEBP_RESTRICT g,
+                            const uint8_t* WEBP_RESTRICT b,
+                            int len, uint32_t* WEBP_RESTRICT out);
 #endif
 
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                           int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+                           const uint8_t* WEBP_RESTRICT g,
+                           const uint8_t* WEBP_RESTRICT b,
+                           int len, int step, uint32_t* WEBP_RESTRICT out);
 
 // This function returns true if src[i] contains a value different from 0xff.
 extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 // This function returns true if src[4*i] contains a value different from 0xff.
 extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+// replaces transparent values in src[] by 'color'.
+extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
 
 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index 2fddbc4..ea47a3f 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -773,10 +773,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8EncDspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8EncDspInitSSE41();
       }
@@ -800,7 +800,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8EncDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c
index 43bf124..3a04111 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -9,7 +9,7 @@
 //
 // ARM NEON version of speed-critical encoding functions.
 //
-// adapted from libvpx (http://www.webmproject.org/code/)
+// adapted from libvpx (https://www.webmproject.org/code/)
 
 #include "src/dsp/dsp.h"
 
@@ -764,9 +764,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
 
 // Horizontal sum of all four uint32_t values in 'sum'.
 static int SumToInt_NEON(uint32x4_t sum) {
+#if defined(__aarch64__)
+  return (int)vaddvq_u32(sum);
+#else
   const uint64x2_t sum2 = vpaddlq_u32(sum);
-  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
-  return (int)sum3;
+  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
+                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
+  return (int)vget_lane_u32(sum3, 0);
+#endif
 }
 
 static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index b2e78ed..1d10556 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -156,10 +156,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
       ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
     } else {
       // Load four bytes/pixels per line.
-      ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
-      ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
-      ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
-      ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
+      ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS]));
+      ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS]));
+      ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS]));
+      ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS]));
     }
     // Convert to 16b.
     ref0 = _mm_unpacklo_epi8(ref0, zero);
@@ -185,10 +185,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
       _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
     } else {
       // Store four bytes/pixels per line.
-      WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
-      WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
-      WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
-      WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
+      WebPInt32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
+      WebPInt32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
+      WebPInt32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
+      WebPInt32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
     }
   }
 }
@@ -481,7 +481,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
 // helper for chroma-DC predictions
 static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 8; ++j) {
     _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
   }
@@ -489,7 +489,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
 
 static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 16; ++j) {
     _mm_store_si128((__m128i*)(dst + j * BPS), values);
   }
@@ -540,7 +540,7 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
 static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 8; ++j) {
-    const __m128i values = _mm_set1_epi8(left[j]);
+    const __m128i values = _mm_set1_epi8((char)left[j]);
     _mm_storel_epi64((__m128i*)dst, values);
     dst += BPS;
   }
@@ -549,7 +549,7 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
 static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 16; ++j) {
-    const __m128i values = _mm_set1_epi8(left[j]);
+    const __m128i values = _mm_set1_epi8((char)left[j]);
     _mm_store_si128((__m128i*)dst, values);
     dst += BPS;
   }
@@ -722,10 +722,10 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
   const __m128i b = _mm_subs_epu8(a, lsb);
   const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
-  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  const int vals = _mm_cvtsi128_si32(avg);
   int i;
   for (i = 0; i < 4; ++i) {
-    WebPUint32ToMem(dst + i * BPS, vals);
+    WebPInt32ToMem(dst + i * BPS, vals);
   }
 }
 
@@ -760,10 +760,10 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
@@ -782,10 +782,10 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
 
   // these two are hard to implement in SSE2, so we keep the C-version:
   DST(0, 2) = AVG3(J, I, X);
@@ -807,11 +807,12 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
   const __m128i abbc = _mm_or_si128(ab, bc);
   const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
   const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
-  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+  const uint32_t extra_out =
+      (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
 
   // these two are hard to get and irregular
   DST(3, 2) = (extra_out >> 0) & 0xff;
@@ -829,10 +830,10 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
@@ -875,14 +876,14 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
 
 static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
   int y;
   for (y = 0; y < 4; ++y, dst += BPS) {
     const int val = top[-2 - y] - top[-1];
     const __m128i base = _mm_set1_epi16(val);
     const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
-    WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+    WebPInt32ToMem(dst, _mm_cvtsi128_si32(out));
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
index 9e910d9..4506567 100644
--- a/src/3rdparty/libwebp/src/dsp/filters.c
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
 #endif
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8FiltersInitSSE2();
     }
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8FiltersInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
index 4b3f2d0..5c33ec1 100644
--- a/src/3rdparty/libwebp/src/dsp/filters_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
@@ -320,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+#if defined(CHROMIUM)
+  // TODO(crbug.com/654974)
+  (void)VerticalUnfilter_SSE2;
+#else
   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+#endif
   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
 
   WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index d05af84..fb86e58 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -49,7 +49,7 @@ static WEBP_INLINE uint32_t Clip255(uint32_t a) {
 }
 
 static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
-  return Clip255(a + b - c);
+  return Clip255((uint32_t)(a + b - c));
 }
 
 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
@@ -66,7 +66,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
 }
 
 static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
-  return Clip255(a + (a - b) / 2);
+  return Clip255((uint32_t)(a + (a - b) / 2));
 }
 
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
@@ -81,7 +81,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
 
 // gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
 // inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
+#if defined(__arm__) && defined(__GNUC__) && LOCAL_GCC_VERSION <= 0x409
 # define LOCAL_INLINE __attribute__ ((noinline))
 #else
 # define LOCAL_INLINE WEBP_INLINE
@@ -107,88 +107,107 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
-  return left;
+  return *left;
 }
-static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average3(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
   return pred;
 }
 
-GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
+  int x;
+  (void)upper;
+  for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
+}
 static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
                             int num_pixels, uint32_t* out) {
   int i;
   uint32_t left = out[-1];
+  (void)upper;
   for (i = 0; i < num_pixels; ++i) {
     out[i] = left = VP8LAddPixels(in[i], left);
   }
-  (void)upper;
 }
-GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
-GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
-GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
-GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
-GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
-GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
-GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
-GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
-GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
-GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
-GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
-GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
 
 //------------------------------------------------------------------------------
 
@@ -274,10 +293,10 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
     const uint32_t red = argb >> 16;
     int new_red = red & 0xff;
     int new_blue = argb & 0xff;
-    new_red += ColorTransformDelta(m->green_to_red_, green);
+    new_red += ColorTransformDelta((int8_t)m->green_to_red_, green);
     new_red &= 0xff;
-    new_blue += ColorTransformDelta(m->green_to_blue_, green);
-    new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red);
+    new_blue += ColorTransformDelta((int8_t)m->green_to_blue_, green);
+    new_blue += ColorTransformDelta((int8_t)m->red_to_blue_, (int8_t)new_red);
     new_blue &= 0xff;
     dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
@@ -376,7 +395,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
   assert(row_start < row_end);
   assert(row_end <= transform->ysize_);
   switch (transform->type_) {
-    case SUBTRACT_GREEN:
+    case SUBTRACT_GREEN_TRANSFORM:
       VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
       break;
     case PREDICTOR_TRANSFORM:
@@ -557,7 +576,6 @@ VP8LPredictorFunc VP8LPredictors[16];
 
 // exposed plain-C implementations
 VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
-VP8LPredictorFunc VP8LPredictors_C[16];
 
 VP8LTransformColorInverseFunc VP8LTransformColorInverse;
 
@@ -571,6 +589,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
 VP8LMapAlphaFunc VP8LMapColor8b;
 
 extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitSSE41(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
 extern void VP8LDspInitMSA(void);
@@ -595,8 +614,7 @@ extern void VP8LDspInitMSA(void);
 } while (0);
 
 WEBP_DSP_INIT_FUNC(VP8LDspInit) {
-  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
-  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+  COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
 
@@ -618,9 +636,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8LDspInitSSE41();
+      }
+#endif
     }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
@@ -635,7 +658,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8LDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index f709cc8..de60d95 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -28,9 +28,39 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Decoding
 
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+                                      const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
-extern VP8LPredictorFunc VP8LPredictors_C[16];
+
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top);
+
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                         const uint32_t* upper, int num_pixels,
@@ -152,9 +182,9 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.
 
-typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
-typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
-                                       int length);
+typedef float (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+                                      int length);
 typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
                                                 const int Y[256]);
 
@@ -168,7 +198,7 @@ typedef struct {        // small struct to hold counters
 } VP8LStreaks;
 
 typedef struct {            // small struct to hold bit entropy results
-  double entropy;           // entropy
+  float entropy;            // entropy
   uint32_t sum;             // sum of the population
   int nonzeros;             // number of non-zero elements in the population
   uint32_t max_val;         // maximum value in the population
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_common.h b/src/3rdparty/libwebp/src/dsp/lossless_common.h
index a2648d1..6a2f736 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_common.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless_common.h
@@ -177,24 +177,13 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
 static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
                           int num_pixels, uint32_t* out) {           \
   int x;                                                             \
+  assert(upper != NULL);                                             \
   for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x);        \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
     out[x] = VP8LAddPixels(in[x], pred);                             \
   }                                                                  \
 }
 
-// It subtracts the prediction from the input pixel and stores the residual
-// in the output pixel.
-#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB)             \
-static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
-                          int num_pixels, uint32_t* out) {           \
-  int x;                                                             \
-  for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x);         \
-    out[x] = VP8LSubPixels(in[x], pred);                             \
-  }                                                                  \
-}
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
index 9c36055..b1f9f26 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -329,6 +329,15 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
 static float FastSLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+    // use clz if available
+    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t y = 1 << log_cnt;
+    int correction = 0;
+    const float v_f = (float)v;
+    const uint32_t orig_v = v;
+    v >>= log_cnt;
+#else
     int log_cnt = 0;
     uint32_t y = 1;
     int correction = 0;
@@ -339,6 +348,7 @@ static float FastSLog2Slow_C(uint32_t v) {
       v = v >> 1;
       y = y << 1;
     } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
     // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
     // Xf = floor(Xf) * (1 + (v % y) / v)
     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
@@ -355,6 +365,14 @@ static float FastSLog2Slow_C(uint32_t v) {
 static float FastLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+    // use clz if available
+    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t y = 1 << log_cnt;
+    const uint32_t orig_v = v;
+    double log_2;
+    v >>= log_cnt;
+#else
     int log_cnt = 0;
     uint32_t y = 1;
     const uint32_t orig_v = v;
@@ -364,6 +382,7 @@ static float FastLog2Slow_C(uint32_t v) {
       v = v >> 1;
       y = y << 1;
     } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
     log_2 = kLog2Table[v] + log_cnt;
     if (orig_v >= APPROX_LOG_MAX) {
       // Since the division is still expensive, add this correction factor only
@@ -383,7 +402,7 @@ static float FastLog2Slow_C(uint32_t v) {
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
   int i;
-  double retval = 0.;
+  float retval = 0.f;
   int sumX = 0, sumXY = 0;
   for (i = 0; i < 256; ++i) {
     const int x = X[i];
@@ -399,7 +418,7 @@ static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
     }
   }
   retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
+  return retval;
 }
 
 void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
@@ -503,11 +522,11 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const int argb = argb_data[i];
+    const int argb = (int)argb_data[i];
     const int green = (argb >> 8) & 0xff;
     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
     const uint32_t new_b = (((argb >>  0) & 0xff) - green) & 0xff;
-    argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
+    argb_data[i] = ((uint32_t)argb & 0xff00ff00u) | (new_r << 16) | new_b;
   }
 }
 
@@ -528,10 +547,10 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
     const int8_t red   = U32ToS8(argb >> 16);
     int new_red = red & 0xff;
     int new_blue = argb & 0xff;
-    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red -= ColorTransformDelta((int8_t)m->green_to_red_, green);
     new_red &= 0xff;
-    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
-    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue -= ColorTransformDelta((int8_t)m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta((int8_t)m->red_to_blue_, red);
     new_blue &= 0xff;
     data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
@@ -541,7 +560,7 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
                                              uint32_t argb) {
   const int8_t green = U32ToS8(argb >> 8);
   int new_red = argb >> 16;
-  new_red -= ColorTransformDelta(green_to_red, green);
+  new_red -= ColorTransformDelta((int8_t)green_to_red, green);
   return (new_red & 0xff);
 }
 
@@ -550,9 +569,9 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
                                               uint32_t argb) {
   const int8_t green = U32ToS8(argb >>  8);
   const int8_t red   = U32ToS8(argb >> 16);
-  uint8_t new_blue = argb & 0xff;
-  new_blue -= ColorTransformDelta(green_to_blue, green);
-  new_blue -= ColorTransformDelta(red_to_blue, red);
+  int new_blue = argb & 0xff;
+  new_blue -= ColorTransformDelta((int8_t)green_to_blue, green);
+  new_blue -= ColorTransformDelta((int8_t)red_to_blue, red);
   return (new_blue & 0xff);
 }
 
@@ -617,17 +636,17 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
 
 //------------------------------------------------------------------------------
 
-static double ExtraCost_C(const uint32_t* population, int length) {
+static float ExtraCost_C(const uint32_t* population, int length) {
   int i;
-  double cost = 0.;
+  float cost = 0.f;
   for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
   return cost;
 }
 
-static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
                                   int length) {
   int i;
-  double cost = 0.;
+  float cost = 0.f;
   for (i = 2; i < length - 2; ++i) {
     const int xy = X[i + 2] + Y[i + 2];
     cost += (i >> 1) * xy;
@@ -702,140 +721,6 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
 //------------------------------------------------------------------------------
 // Image transforms.
 
-static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
-  return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
-}
-
-static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
-  return Average2(Average2(a0, a2), a1);
-}
-
-static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
-                                     uint32_t a2, uint32_t a3) {
-  return Average2(Average2(a0, a1), Average2(a2, a3));
-}
-
-static WEBP_INLINE uint32_t Clip255(uint32_t a) {
-  if (a < 256) {
-    return a;
-  }
-  // return 0, when a is a negative integer.
-  // return 255, when a is positive.
-  return ~a >> 24;
-}
-
-static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
-  return Clip255(a + b - c);
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
-  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
-  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
-                                         (c1 >> 16) & 0xff,
-                                         (c2 >> 16) & 0xff);
-  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
-                                         (c1 >> 8) & 0xff,
-                                         (c2 >> 8) & 0xff);
-  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
-  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
-}
-
-static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
-  return Clip255(a + (a - b) / 2);
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
-  const uint32_t ave = Average2(c0, c1);
-  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
-  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
-  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
-  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
-  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
-}
-
-// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
-#if defined(__arm__) && \
-    (LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408)
-# define LOCAL_INLINE __attribute__ ((noinline))
-#else
-# define LOCAL_INLINE WEBP_INLINE
-#endif
-
-static LOCAL_INLINE int Sub3(int a, int b, int c) {
-  const int pb = b - c;
-  const int pa = a - c;
-  return abs(pb) - abs(pa);
-}
-
-#undef LOCAL_INLINE
-
-static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
-  const int pa_minus_pb =
-      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
-      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
-      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
-      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
-  return (pa_minus_pb <= 0) ? a : b;
-}
-
-//------------------------------------------------------------------------------
-// Predictors
-
-static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
-  (void)left;
-  return top[0];
-}
-static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
-  (void)left;
-  return top[1];
-}
-static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
-  (void)left;
-  return top[-1];
-}
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
-  return pred;
-}
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
-  return pred;
-}
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
-  return pred;
-}
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[-1], top[0]);
-  (void)left;
-  return pred;
-}
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[0], top[1]);
-  (void)left;
-  return pred;
-}
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
-  return pred;
-}
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
-  return pred;
-}
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
-  return pred;
-}
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
-  return pred;
-}
-
-//------------------------------------------------------------------------------
-
 static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
                             int num_pixels, uint32_t* out) {
   int i;
@@ -850,18 +735,33 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
   (void)upper;
 }
 
-GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C)
-GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C)
-GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C)
-GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C)
-GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C)
-GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C)
-GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C)
-GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C)
-GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C)
-GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C)
-GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C)
-GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C)
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR_I)                                \
+static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
+                                          const uint32_t* upper,           \
+                                          int num_pixels, uint32_t* out) { \
+  int x;                                                                   \
+  assert(upper != NULL);                                                   \
+  for (x = 0; x < num_pixels; ++x) {                                       \
+    const uint32_t pred =                                                  \
+        VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x);             \
+    out[x] = VP8LSubPixels(in[x], pred);                                   \
+  }                                                                        \
+}
+
+GENERATE_PREDICTOR_SUB(2)
+GENERATE_PREDICTOR_SUB(3)
+GENERATE_PREDICTOR_SUB(4)
+GENERATE_PREDICTOR_SUB(5)
+GENERATE_PREDICTOR_SUB(6)
+GENERATE_PREDICTOR_SUB(7)
+GENERATE_PREDICTOR_SUB(8)
+GENERATE_PREDICTOR_SUB(9)
+GENERATE_PREDICTOR_SUB(10)
+GENERATE_PREDICTOR_SUB(11)
+GENERATE_PREDICTOR_SUB(12)
+GENERATE_PREDICTOR_SUB(13)
 
 //------------------------------------------------------------------------------
 
@@ -962,10 +862,10 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LEncDspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8LEncDspInitSSE41();
       }
@@ -989,7 +889,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8LEncDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 0412a09..639f786 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
 //     cost += i * *(pop + 1);
 //     pop += 2;
 //   }
-//   return (double)cost;
-static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
+//   return (float)cost;
+static float ExtraCost_MIPS32(const uint32_t* const population, int length) {
   int i, temp0, temp1;
   const uint32_t* pop = &population[4];
   const uint32_t* const LoopEnd = &population[length];
@@ -130,7 +130,7 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
     : "memory", "hi", "lo"
   );
 
-  return (double)((int64_t)temp0 << 32 | temp1);
+  return (float)((int64_t)temp0 << 32 | temp1);
 }
 
 // C version of this function:
@@ -148,9 +148,9 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
 //     pX += 2;
 //     pY += 2;
 //   }
-//   return (double)cost;
-static double ExtraCostCombined_MIPS32(const uint32_t* const X,
-                                       const uint32_t* const Y, int length) {
+//   return (float)cost;
+static float ExtraCostCombined_MIPS32(const uint32_t* const X,
+                                      const uint32_t* const Y, int length) {
   int i, temp0, temp1, temp2, temp3;
   const uint32_t* pX = &X[4];
   const uint32_t* pY = &Y[4];
@@ -183,7 +183,7 @@ static double ExtraCostCombined_MIPS32(const uint32_t* const X,
     : "memory", "hi", "lo"
   );
 
-  return (double)((int64_t)temp0 << 32 | temp1);
+  return (float)((int64_t)temp0 << 32 | temp1);
 }
 
 #define HUFFMAN_COST_PASS                                 \
@@ -347,24 +347,24 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
 static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
                              uint32_t* pout, int size) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  const uint32_t end = ((size) / 4) * 4;
+  const int end = ((size) / 4) * 4;
   const uint32_t* const LoopEnd = pa + end;
   int i;
   ASM_START
   ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
   ASM_END_0
-  for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
+  for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
 }
 
 static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  const uint32_t end = ((size) / 4) * 4;
+  const int end = ((size) / 4) * 4;
   const uint32_t* const LoopEnd = pa + end;
   int i;
   ASM_START
   ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
   ASM_END_1
-  for (i = end; i < size; ++i) pout[i] += pa[i];
+  for (i = 0; i < size - end; ++i) pout[i] += pa[i];
 }
 
 #undef ASM_END_1
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 8adc521..66cbaab 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -54,8 +54,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m,
   const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
                                      CST_5b(m->green_to_blue_));
   const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
-  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
-  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
+  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
+  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);       // red-blue masks
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@@ -232,76 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
 //------------------------------------------------------------------------------
 // Entropy
 
-// Checks whether the X or Y contribution is worth computing and adding.
-// Used in loop unrolling.
-#define ANALYZE_X_OR_Y(x_or_y, j)                                           \
-  do {                                                                      \
-    if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
-  } while (0)
-
-// Checks whether the X + Y contribution is worth computing and adding.
-// Used in loop unrolling.
-#define ANALYZE_XY(j)                  \
-  do {                                 \
-    if (tmp[j] != 0) {                 \
-      retval -= VP8LFastSLog2(tmp[j]); \
-      ANALYZE_X_OR_Y(X, j);            \
-    }                                  \
-  } while (0)
+// TODO(https://crbug.com/webp/499): this function produces different results
+// from the C code due to use of double/float resulting in output differences
+// when compared to -noasm.
+#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
 
 static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
   int i;
-  double retval = 0.;
-  int sumX, sumXY;
-  int32_t tmp[4];
-  __m128i zero = _mm_setzero_si128();
-  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
-  __m128i sumXY_128 = zero;
-  __m128i sumX_128 = zero;
-
-  for (i = 0; i < 256; i += 4) {
-    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
-    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
-
-    // Check if any X is non-zero: this actually provides a speedup as X is
-    // usually sparse.
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
-      const __m128i xy_128 = _mm_add_epi32(x, y);
-      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
-
-      sumX_128 = _mm_add_epi32(sumX_128, x);
-
-      // Analyze the different X + Y.
-      _mm_storeu_si128((__m128i*)tmp, xy_128);
-
-      ANALYZE_XY(0);
-      ANALYZE_XY(1);
-      ANALYZE_XY(2);
-      ANALYZE_XY(3);
-    } else {
-      // X is fully 0, so only deal with Y.
-      sumXY_128 = _mm_add_epi32(sumXY_128, y);
-
-      ANALYZE_X_OR_Y(Y, 0);
-      ANALYZE_X_OR_Y(Y, 1);
-      ANALYZE_X_OR_Y(Y, 2);
-      ANALYZE_X_OR_Y(Y, 3);
+  float retval = 0.f;
+  int sumX = 0, sumXY = 0;
+  const __m128i zero = _mm_setzero_si128();
+
+  for (i = 0; i < 256; i += 16) {
+    const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i +  0));
+    const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i +  0));
+    const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i +  4));
+    const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i +  4));
+    const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i +  8));
+    const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i +  8));
+    const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
+    const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
+    const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
+                                       _mm_packs_epi32(x2, x3));
+    const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
+                                       _mm_packs_epi32(y2, y3));
+    const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
+    int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
+    while (my) {
+      const int32_t j = BitsCtz(my);
+      int xy;
+      if ((mx >> j) & 1) {
+        const int x = X[i + j];
+        sumXY += x;
+        retval -= VP8LFastSLog2(x);
+      }
+      xy = X[i + j] + Y[i + j];
+      sumX += xy;
+      retval -= VP8LFastSLog2(xy);
+      my &= my - 1;
     }
   }
+  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  return retval;
+}
 
-  // Sum up sumX_128 to get sumX.
-  _mm_storeu_si128((__m128i*)tmp, sumX_128);
-  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+#else
 
-  // Sum up sumXY_128 to get sumXY.
-  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
-  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC   // won't be faster
 
-  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
-}
-#undef ANALYZE_X_OR_Y
-#undef ANALYZE_XY
+#endif
 
 //------------------------------------------------------------------------------
 
@@ -397,7 +376,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
       break;
     }
     case 2: {
-      const __m128i mask_or = _mm_set1_epi32(0xff000000);
+      const __m128i mask_or = _mm_set1_epi32((int)0xff000000);
       const __m128i mul_cst = _mm_set1_epi16(0x0104);
       const __m128i mask_mul = _mm_set1_epi16(0x0f00);
       for (x = 0; x + 16 <= width; x += 16, dst += 4) {
@@ -448,31 +427,34 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
   int i;
-  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     const __m128i res = _mm_sub_epi8(src, black);
     _mm_storeu_si128((__m128i*)&out[i], res);
   }
   if (i != num_pixels) {
-    VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i);
+    VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
   }
+  (void)upper;
 }
 
-#define GENERATE_PREDICTOR_1(X, IN)                                           \
-static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) {           \
-  int i;                                                                      \
-  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
-    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
-    const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));              \
-    const __m128i res = _mm_sub_epi8(src, pred);                              \
-    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
-  }                                                                           \
-  if (i != num_pixels) {                                                      \
-    VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
-  }                                                                           \
-}
+#define GENERATE_PREDICTOR_1(X, IN)                                         \
+  static void PredictorSub##X##_SSE2(const uint32_t* const in,              \
+                                     const uint32_t* const upper,           \
+                                     int num_pixels, uint32_t* const out) { \
+    int i;                                                                  \
+    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
+      const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));          \
+      const __m128i res = _mm_sub_epi8(src, pred);                          \
+      _mm_storeu_si128((__m128i*)&out[i], res);                             \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i),           \
+                               num_pixels - i, out + i);                    \
+    }                                                                       \
+  }
 
 GENERATE_PREDICTOR_1(1, in[i - 1])       // Predictor1: L
 GENERATE_PREDICTOR_1(2, upper[i])        // Predictor2: T
@@ -656,7 +638,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
   VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
   VP8LAddVector = AddVector_SSE2;
   VP8LAddVectorEq = AddVectorEq_SSE2;
+#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
   VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+#endif
   VP8LVectorMismatch = VectorMismatch_SSE2;
   VP8LBundleColorMap = BundleColorMap_SSE2;
 
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 719d8ed..ad358a6 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -44,46 +44,47 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
 //------------------------------------------------------------------------------
 // Color Transform
 
-#define SPAN 8
+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
 static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
                                              int tile_width, int tile_height,
                                              int green_to_blue, int red_to_blue,
                                              int histo[]) {
-  const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
-  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
-  const __m128i mask_g = _mm_set1_epi16((short)0xff00);   // green mask
-  const __m128i mask_gb = _mm_set1_epi32(0xffff);         // green/blue mask
-  const __m128i mask_b = _mm_set1_epi16(0x00ff);          // blue mask
-  const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
-                                            -1, -1, -1, -1, -1, -1, -1);
-  const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                            2, -1, 6, -1, 10, -1, 14);
-  int y;
-  for (y = 0; y < tile_height; ++y) {
-    const uint32_t* const src = argb + y * stride;
-    int i, x;
-    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
-      uint16_t values[SPAN];
-      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
-      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
-      const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
-      const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
-      const __m128i r = _mm_or_si128(r0, r1);         // r 0
-      const __m128i gb0 = _mm_and_si128(in0, mask_gb);
-      const __m128i gb1 = _mm_and_si128(in1, mask_gb);
-      const __m128i gb = _mm_packus_epi32(gb0, gb1);  // g b
-      const __m128i g = _mm_and_si128(gb, mask_g);    // g 0
-      const __m128i A = _mm_mulhi_epi16(r, mults_r);  // x dbr
-      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dbg
-      const __m128i C = _mm_sub_epi8(gb, B);          // x b'
-      const __m128i D = _mm_sub_epi8(C, A);           // x b''
-      const __m128i E = _mm_and_si128(D, mask_b);     // 0 b''
-      _mm_storeu_si128((__m128i*)values, E);
-      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+  const __m128i mult =
+      MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
+  const __m128i perm =
+      _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
+  if (tile_width >= 4) {
+    int y;
+    for (y = 0; y < tile_height; ++y) {
+      const uint32_t* const src = argb + y * stride;
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+      const __m128i B1 = _mm_shuffle_epi8(A1, perm);
+      const __m128i C1 = _mm_mulhi_epi16(B1, mult);
+      const __m128i D1 = _mm_sub_epi16(A1, C1);
+      __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
+      int x;
+      for (x = 4; x + 4 <= tile_width; x += 4) {
+        const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+        __m128i B2, C2, D2;
+        ++histo[_mm_extract_epi8(E,  0)];
+        B2 = _mm_shuffle_epi8(A2, perm);
+        ++histo[_mm_extract_epi8(E,  4)];
+        C2 = _mm_mulhi_epi16(B2, mult);
+        ++histo[_mm_extract_epi8(E,  8)];
+        D2 = _mm_sub_epi16(A2, C2);
+        ++histo[_mm_extract_epi8(E, 12)];
+        E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
+      }
+      ++histo[_mm_extract_epi8(E,  0)];
+      ++histo[_mm_extract_epi8(E,  4)];
+      ++histo[_mm_extract_epi8(E,  8)];
+      ++histo[_mm_extract_epi8(E, 12)];
     }
   }
   {
-    const int left_over = tile_width & (SPAN - 1);
+    const int left_over = tile_width & 3;
     if (left_over > 0) {
       VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
                                        left_over, tile_height,
@@ -95,33 +96,37 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
 static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
                                             int tile_width, int tile_height,
                                             int green_to_red, int histo[]) {
-  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
-  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
-  const __m128i mask = _mm_set1_epi16(0xff);
-
-  int y;
-  for (y = 0; y < tile_height; ++y) {
-    const uint32_t* const src = argb + y * stride;
-    int i, x;
-    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
-      uint16_t values[SPAN];
-      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
-      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
-      const __m128i g0 = _mm_and_si128(in0, mask_g);  // 0 0  | g 0
-      const __m128i g1 = _mm_and_si128(in1, mask_g);
-      const __m128i g = _mm_packus_epi32(g0, g1);     // g 0
-      const __m128i A0 = _mm_srli_epi32(in0, 16);     // 0 0  | x r
-      const __m128i A1 = _mm_srli_epi32(in1, 16);
-      const __m128i A = _mm_packus_epi32(A0, A1);     // x r
-      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dr
-      const __m128i C = _mm_sub_epi8(A, B);           // x r'
-      const __m128i D = _mm_and_si128(C, mask);       // 0 r'
-      _mm_storeu_si128((__m128i*)values, D);
-      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+
+  const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
+  if (tile_width >= 4) {
+    int y;
+    for (y = 0; y < tile_height; ++y) {
+      const uint32_t* const src = argb + y * stride;
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+      const __m128i B1 = _mm_and_si128(A1, mask_g);
+      const __m128i C1 = _mm_madd_epi16(B1, mult);
+      __m128i D = _mm_sub_epi16(A1, C1);
+      int x;
+      for (x = 4; x + 4 <= tile_width; x += 4) {
+        const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+        __m128i B2, C2;
+        ++histo[_mm_extract_epi8(D,  2)];
+        B2 = _mm_and_si128(A2, mask_g);
+        ++histo[_mm_extract_epi8(D,  6)];
+        C2 = _mm_madd_epi16(B2, mult);
+        ++histo[_mm_extract_epi8(D, 10)];
+        ++histo[_mm_extract_epi8(D, 14)];
+        D = _mm_sub_epi16(A2, C2);
+      }
+      ++histo[_mm_extract_epi8(D,  2)];
+      ++histo[_mm_extract_epi8(D,  6)];
+      ++histo[_mm_extract_epi8(D, 10)];
+      ++histo[_mm_extract_epi8(D, 14)];
     }
   }
   {
-    const int left_over = tile_width & (SPAN - 1);
+    const int left_over = tile_width & 3;
     if (left_over > 0) {
       VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                       left_over, tile_height, green_to_red,
@@ -130,6 +135,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
   }
 }
 
+#undef MK_CST_16
+
 //------------------------------------------------------------------------------
 // Entry point
 
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 9888854..bfe5ea6 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -188,46 +188,51 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average3(left, top[0], top[1]);
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average3(*left, top[0], top[1]);
 }
 
-static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[-1]);
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[-1]);
 }
 
-static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[0]);
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[0]);
 }
 
-static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[-1], top[0]);
 }
 
-static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[0], top[1]);
 }
 
-static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Average4(left, top[-1], top[0], top[1]);
+  return Average4(*left, top[-1], top[0], top[1]);
 }
 
-static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Select(top[0], left, top[-1]);
+  return Select(top[0], *left, top[-1]);
 }
 
-static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractFull(left, top[0], top[-1]);
+  return ClampedAddSubtractFull(*left, top[0], top[-1]);
 }
 
-static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return ClampedAddSubtractHalf(*left, top[0], top[-1]);
 }
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
index 76a1b6f..89e3e01 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
   return avg;
 }
 
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
-  return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average3_NEON(*left, top[0], top[1]);
 }
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[-1]);
 }
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[0]);
 }
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
-  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
 }
 
 // Batch versions of those functions.
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index 17d7576..4b6a532 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -18,7 +18,6 @@
 #include "src/dsp/common_sse2.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include <assert.h>
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
@@ -28,23 +27,22 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                         uint32_t c1,
                                                         uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
+  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
   const __m128i V1 = _mm_add_epi16(C0, C1);
   const __m128i V2 = _mm_sub_epi16(V1, C2);
   const __m128i b = _mm_packus_epi16(V2, V2);
-  const uint32_t output = _mm_cvtsi128_si32(b);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(b);
 }
 
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                         uint32_t c1,
                                                         uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
+  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
   const __m128i avg = _mm_add_epi16(C1, C0);
   const __m128i A0 = _mm_srli_epi16(avg, 1);
   const __m128i A1 = _mm_sub_epi16(A0, B0);
@@ -53,16 +51,15 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
   const __m128i A3 = _mm_srai_epi16(A2, 1);
   const __m128i A4 = _mm_add_epi16(A0, A3);
   const __m128i A5 = _mm_packus_epi16(A4, A4);
-  const uint32_t output = _mm_cvtsi128_si32(A5);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(A5);
 }
 
 static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
   int pa_minus_pb;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_cvtsi32_si128(a);
-  const __m128i B0 = _mm_cvtsi32_si128(b);
-  const __m128i C0 = _mm_cvtsi32_si128(c);
+  const __m128i A0 = _mm_cvtsi32_si128((int)a);
+  const __m128i B0 = _mm_cvtsi32_si128((int)b);
+  const __m128i C0 = _mm_cvtsi32_si128((int)c);
   const __m128i AC0 = _mm_subs_epu8(A0, C0);
   const __m128i CA0 = _mm_subs_epu8(C0, A0);
   const __m128i BC0 = _mm_subs_epu8(B0, C0);
@@ -95,8 +92,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                              __m128i* const avg) {
   // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
   const __m128i ones = _mm_set1_epi8(1);
-  const __m128i A0 = _mm_cvtsi32_si128(a0);
-  const __m128i A1 = _mm_cvtsi32_si128(a1);
+  const __m128i A0 = _mm_cvtsi32_si128((int)a0);
+  const __m128i A1 = _mm_cvtsi32_si128((int)a1);
   const __m128i avg1 = _mm_avg_epu8(A0, A1);
   const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
   *avg = _mm_sub_epi8(avg1, one);
@@ -104,8 +101,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
 
 static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
-  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
   const __m128i sum = _mm_add_epi16(A1, A0);
   return _mm_srli_epi16(sum, 1);
 }
@@ -113,19 +110,18 @@ static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
 static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
   __m128i output;
   Average2_uint32_SSE2(a0, a1, &output);
-  return _mm_cvtsi128_si32(output);
+  return (uint32_t)_mm_cvtsi128_si32(output);
 }
 
 static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                           uint32_t a2) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
-  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
   const __m128i sum = _mm_add_epi16(avg1, A1);
   const __m128i avg2 = _mm_srli_epi16(sum, 1);
   const __m128i A2 = _mm_packus_epi16(avg2, avg2);
-  const uint32_t output = _mm_cvtsi128_si32(A2);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(A2);
 }
 
 static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
@@ -135,46 +131,54 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
   const __m128i sum = _mm_add_epi16(avg2, avg1);
   const __m128i avg3 = _mm_srli_epi16(sum, 1);
   const __m128i A0 = _mm_packus_epi16(avg3, avg3);
-  const uint32_t output = _mm_cvtsi128_si32(A0);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(A0);
 }
 
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
   return pred;
 }
 
@@ -184,22 +188,23 @@ static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
   int i;
-  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     const __m128i res = _mm_add_epi8(src, black);
     _mm_storeu_si128((__m128i*)&out[i], res);
   }
   if (i != num_pixels) {
-    VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
+    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
   }
+  (void)upper;
 }
 
 // Predictor1: left.
 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
   int i;
-  __m128i prev = _mm_set1_epi32(out[-1]);
+  __m128i prev = _mm_set1_epi32((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     // a | b | c | d
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -276,12 +281,12 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2
 
 // Predictor10: average of (average of (L,TL), average of (T, TR)).
-#define DO_PRED10(OUT) do {               \
-  __m128i avgLTL, avg;                    \
-  Average2_m128i(&L, &TL, &avgLTL);       \
-  Average2_m128i(&avgTTR, &avgLTL, &avg); \
-  L = _mm_add_epi8(avg, src);             \
-  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
+#define DO_PRED10(OUT) do {                         \
+  __m128i avgLTL, avg;                              \
+  Average2_m128i(&L, &TL, &avgLTL);                 \
+  Average2_m128i(&avgTTR, &avgLTL, &avg);           \
+  L = _mm_add_epi8(avg, src);                       \
+  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);  \
 } while (0)
 
 #define DO_PRED10_SHIFT do {                                  \
@@ -294,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
   int i;
-  __m128i L = _mm_cvtsi32_si128(out[-1]);
+  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
@@ -327,7 +332,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
   const __m128i B = _mm_andnot_si128(mask, T);                         \
   const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
   L = _mm_add_epi8(src, pred);                                         \
-  out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
+  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                     \
 } while (0)
 
 #define DO_PRED11_SHIFT do {                                \
@@ -342,7 +347,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
   int i;
   __m128i pa;
-  __m128i L = _mm_cvtsi32_si128(out[-1]);
+  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
@@ -375,12 +380,12 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 #undef DO_PRED11_SHIFT
 
 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT) do {            \
-  const __m128i all = _mm_add_epi16(L, (DIFF));    \
-  const __m128i alls = _mm_packus_epi16(all, all); \
-  const __m128i res = _mm_add_epi8(src, alls);     \
-  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
-  L = _mm_unpacklo_epi8(res, zero);                \
+#define DO_PRED12(DIFF, LANE, OUT) do {              \
+  const __m128i all = _mm_add_epi16(L, (DIFF));      \
+  const __m128i alls = _mm_packus_epi16(all, all);   \
+  const __m128i res = _mm_add_epi8(src, alls);       \
+  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \
+  L = _mm_unpacklo_epi8(res, zero);                  \
 } while (0)
 
 #define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
@@ -393,7 +398,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
   int i;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
+  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
   __m128i L = _mm_unpacklo_epi8(L8, zero);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     // Load 4 pixels at a time.
@@ -459,7 +464,7 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
   const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
 #undef MK_CST_16
 #undef CST
-  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
+  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@@ -523,7 +528,7 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
 
 static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                    int num_pixels, uint8_t* dst) {
-  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
+  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   while (num_pixels >= 8) {
@@ -552,7 +557,7 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
 static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                        int num_pixels, uint8_t* dst) {
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
-  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   while (num_pixels >= 8) {
@@ -587,8 +592,8 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
 
 static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                      int num_pixels, uint8_t* dst) {
-  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
-  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
+  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
+  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_sse41.c
new file mode 100644
index 0000000..bb7ce76
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse41.c
@@ -0,0 +1,133 @@
+// Copyright 2021 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 variant of methods for lossless decoder
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "src/dsp/common_sse41.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
+                                        const uint32_t* const src,
+                                        int num_pixels, uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
+  const __m128i mults_rb =
+      _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 |
+                           (CST(green_to_blue_) & 0xffff)));
+  const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
+#undef CST
+  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);
+  const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
+                                      -1, 9, -1, 9, -1, 13, -1, 13);
+  const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
+                                      -1, 10, -1, -1, -1, 14, -1, -1);
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
+    const __m128i C = _mm_mulhi_epi16(B, mults_rb);
+    const __m128i D = _mm_add_epi8(A, C);
+    const __m128i E = _mm_shuffle_epi8(D, perm2);
+    const __m128i F = _mm_mulhi_epi16(E, mults_b2);
+    const __m128i G = _mm_add_epi8(D, F);
+    const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
+    _mm_storeu_si128((__m128i*)&dst[i], out);
+  }
+  // Fall-back to C-version for left-overs.
+  if (i != num_pixels) {
+    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#define ARGB_TO_RGB_SSE41 do {                        \
+  while (num_pixels >= 16) {                          \
+    const __m128i in0 = _mm_loadu_si128(in + 0);      \
+    const __m128i in1 = _mm_loadu_si128(in + 1);      \
+    const __m128i in2 = _mm_loadu_si128(in + 2);      \
+    const __m128i in3 = _mm_loadu_si128(in + 3);      \
+    const __m128i a0 = _mm_shuffle_epi8(in0, perm0);  \
+    const __m128i a1 = _mm_shuffle_epi8(in1, perm1);  \
+    const __m128i a2 = _mm_shuffle_epi8(in2, perm2);  \
+    const __m128i a3 = _mm_shuffle_epi8(in3, perm3);  \
+    const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
+    const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
+    const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
+    _mm_storeu_si128(out + 0, b0);                    \
+    _mm_storeu_si128(out + 1, b1);                    \
+    _mm_storeu_si128(out + 2, b2);                    \
+    in += 4;                                          \
+    out += 3;                                         \
+    num_pixels -= 16;                                 \
+  }                                                   \
+} while (0)
+
+static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+                                   uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
+                                      8, 14, 13, 12, -1, -1, -1, -1);
+  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+  ARGB_TO_RGB_SSE41;
+
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
+                                      12, 13, 14, -1, -1, -1, -1);
+  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+  ARGB_TO_RGB_SSE41;
+
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+#undef ARGB_TO_RGB_SSE41
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
+  VP8LTransformColorInverse = TransformColorInverse_SSE41;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h
index de026a1..51f6c64 100644
--- a/src/3rdparty/libwebp/src/dsp/msa_macro.h
+++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h
@@ -14,6 +14,10 @@
 #ifndef WEBP_DSP_MSA_MACRO_H_
 #define WEBP_DSP_MSA_MACRO_H_
 
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
 #include <stdint.h>
 #include <msa.h>
 
@@ -1389,4 +1393,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 } while (0)
 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
+#endif  // WEBP_USE_MSA
 #endif  // WEBP_DSP_MSA_MACRO_H_
diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h
index aa1dea1..c591f9b 100644
--- a/src/3rdparty/libwebp/src/dsp/neon.h
+++ b/src/3rdparty/libwebp/src/dsp/neon.h
@@ -12,10 +12,12 @@
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 
-#include <arm_neon.h>
-
 #include "src/dsp/dsp.h"
 
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
 // Right now, some intrinsics functions seem slower, so we disable them
 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
 } while (0)
 #endif
 
+#endif  // WEBP_USE_NEON
 #endif  // WEBP_DSP_NEON_H_
diff --git a/src/3rdparty/libwebp/src/dsp/quant.h b/src/3rdparty/libwebp/src/dsp/quant.h
index 5e8dba8..fc099bf 100644
--- a/src/3rdparty/libwebp/src/dsp/quant.h
+++ b/src/3rdparty/libwebp/src/dsp/quant.h
@@ -21,10 +21,15 @@
 
 #define IsFlat IsFlat_NEON
 
-static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+static uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
   const uint64x2_t b = vpaddlq_u32(a);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                  vreinterpret_u32_u64(vget_high_u64(b)));
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
 }
 
 static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
@@ -45,7 +50,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
 
     levels += 16;
   }
-  return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0);
+  return thresh >= (int)horizontal_add_uint32x4(sum);
 }
 
 #else
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
index c5a01e8..14620ce 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
     int x_out = channel;
     // simple bilinear interpolation
     int accum = wrk->x_add;
-    int left = src[x_in];
-    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    rescaler_t left = (rescaler_t)src[x_in];
+    rescaler_t right =
+        (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
     x_in += x_stride;
     while (1) {
       wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
         left = right;
         x_in += x_stride;
         assert(x_in < wrk->src_width * x_stride);
-        right = src[x_in];
+        right = (rescaler_t)src[x_in];
         accum += wrk->x_add;
       }
     }
@@ -213,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
   WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPRescalerDspInitSSE2();
     }
@@ -235,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPRescalerDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
index d7effea..3f18e94 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -85,7 +85,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
       const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
       const __m128i out = _mm_madd_epi16(cur_pixels, mult);
       assert(sizeof(*frow) == sizeof(uint32_t));
-      WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
+      WebPInt32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
       frow += 1;
       if (frow >= frow_end) break;
       accum -= wrk->x_sub;
@@ -132,7 +132,7 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
     __m128i base = zero;
     accum += wrk->x_add;
     while (accum > 0) {
-      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
+      const __m128i A = _mm_cvtsi32_si128(WebPMemToInt32(src));
       src += 4;
       base = _mm_unpacklo_epi8(A, zero);
       // To avoid overflow, we need: base * x_add / x_sub < 32768
@@ -198,7 +198,7 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
                                         const __m128i* const mult,
                                         uint8_t* const dst) {
   const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
-  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
+  const __m128i mask = _mm_set_epi32(~0, 0, ~0, 0);
   const __m128i B0 = _mm_mul_epu32(*A0, *mult);
   const __m128i B1 = _mm_mul_epu32(*A1, *mult);
   const __m128i B2 = _mm_mul_epu32(*A2, *mult);
diff --git a/src/3rdparty/libwebp/src/dsp/ssim.c b/src/3rdparty/libwebp/src/dsp/ssim.c
index 989ce82..f85c2e6 100644
--- a/src/3rdparty/libwebp/src/dsp/ssim.c
+++ b/src/3rdparty/libwebp/src/dsp/ssim.c
@@ -150,7 +150,7 @@ WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
 #endif
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8SSIMDspInitSSE2();
     }
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 9b60da5..87f771f 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
   WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitYUV444ConvertersSSE2();
     }
 #endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitYUV444ConvertersSSE41();
     }
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitUpsamplersSSE2();
     }
 #endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitUpsamplersSSE41();
     }
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitUpsamplersNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
index 99eea70..f2e03e8 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
@@ -576,9 +576,9 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
   const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16));               \
   const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;            \
   const uint8_t* ptop_y = &top_y[1];                                     \
-  uint8_t *ptop_dst = top_dst + XSTEP;                                   \
+  uint8_t* ptop_dst = top_dst + XSTEP;                                   \
   const uint8_t* pbot_y = &bot_y[1];                                     \
-  uint8_t *pbot_dst = bot_dst + XSTEP;                                   \
+  uint8_t* pbot_dst = bot_dst + XSTEP;                                   \
                                                                          \
   FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                      \
   if (bot_y != NULL) {                                                   \
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
index 17cbc9f..6ba71a7 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -58,8 +58,8 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
-                                  uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
+                                  uint8_t* out) {
   UPSAMPLE_16PIXELS(r1, r2, out);
 }
 
@@ -190,14 +190,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
 }
 
 #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
-static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
-                      const uint8_t *top_u, const uint8_t *top_v,       \
-                      const uint8_t *cur_u, const uint8_t *cur_v,       \
-                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,    \
+                      const uint8_t* top_u, const uint8_t* top_v,       \
+                      const uint8_t* cur_u, const uint8_t* cur_v,       \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
   int block;                                                            \
   /* 16 byte aligned array to cache reconstructed u and v */            \
   uint8_t uv_buf[2 * 32 + 15];                                          \
-  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
   const int uv_len = (len + 1) >> 1;                                    \
   /* 9 pixels must be read-able for each block */                       \
   const int num_blocks = (uv_len - 1) >> 3;                             \
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
index 340f1e2..08b6d0b 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -121,7 +121,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
   int uv_pos, pos;                                                             \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
   uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
-  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15);  \
   uint8_t* const r_v = r_u + 32;                                               \
                                                                                \
   assert(top_y != NULL);                                                       \
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index 14e67fc..d16c13d 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitSamplersSSE2();
     }
-#endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif  // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitSamplersSSE41();
     }
-#endif  // WEBP_USE_SSE41
+#endif  // WEBP_HAVE_SSE41
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       WebPInitSamplersMIPS32();
@@ -194,50 +194,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 
 //-----------------------------------------------------------------------------
 
-#if !WEBP_NEON_OMIT_C_CODE
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
-                                  uint16_t* dst, int len) {
-  uint64_t diff = 0;
-  int i;
-  for (i = 0; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)dst[i] + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)abs(diff_y);
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
-                                int16_t* dst, int len) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
-                                const uint16_t* best_y, uint16_t* out) {
-  int i;
-  for (i = 0; i < len; ++i, ++A, ++B) {
-    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
-    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-#endif  // !WEBP_NEON_OMIT_C_CODE
-
-#undef MAX_Y
-
-//-----------------------------------------------------------------------------
-
 void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
 void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
 void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@@ -247,18 +203,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
 void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
                             int src_width, int do_store);
 
-uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
-                                uint16_t* dst, int len);
-void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
-                              int16_t* dst, int len);
-void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
-                              const uint16_t* best_y, uint16_t* out);
-
 extern void WebPInitConvertARGBToYUVSSE2(void);
 extern void WebPInitConvertARGBToYUVSSE41(void);
 extern void WebPInitConvertARGBToYUVNEON(void);
-extern void WebPInitSharpYUVSSE2(void);
-extern void WebPInitSharpYUVNEON(void);
 
 WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
   WebPConvertARGBToY = ConvertARGBToY_C;
@@ -269,40 +216,29 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
 
   WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
 
-#if !WEBP_NEON_OMIT_C_CODE
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
-#endif
-
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitConvertARGBToYUVSSE2();
-      WebPInitSharpYUVSSE2();
     }
-#endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif  // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitConvertARGBToYUVSSE41();
     }
-#endif  // WEBP_USE_SSE41
+#endif  // WEBP_HAVE_SSE41
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitConvertARGBToYUVNEON();
-    WebPInitSharpYUVNEON();
   }
-#endif  // WEBP_USE_NEON
+#endif  // WEBP_HAVE_NEON
 
   assert(WebPConvertARGBToY != NULL);
   assert(WebPConvertARGBToUV != NULL);
   assert(WebPConvertRGB24ToY != NULL);
   assert(WebPConvertBGR24ToY != NULL);
   assert(WebPConvertRGBA32ToUV != NULL);
-  assert(WebPSharpYUVUpdateY != NULL);
-  assert(WebPSharpYUVUpdateRGB != NULL);
-  assert(WebPSharpYUVFilterRow != NULL);
 }
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index c12be1d..66a397d 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -10,7 +10,7 @@
 // inline YUV<->RGB conversion function
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_neon.c b/src/3rdparty/libwebp/src/dsp/yuv_neon.c
index a34d602..ff77b00 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_neon.c
@@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
 }
 
-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y_NEON(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  int i;
-  const int16x8_t zero = vdupq_n_s16(0);
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  uint64x2_t sum = vdupq_n_u64(0);
-  uint64_t diff;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
-    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
-    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
-    const int16x8_t D = vsubq_s16(A, B);       // diff_y
-    const int16x8_t F = vaddq_s16(C, D);       // new_y
-    const uint16x8_t H =
-        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
-    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
-    vst1q_u16(dst + i, H);
-    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
-  }
-  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)(dst[i]) + diff_y;
-    dst[i] = clip_y_NEON(new_y);
-    diff += (uint64_t)(abs(diff_y));
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vld1q_s16(ref + i);
-    const int16x8_t B = vld1q_s16(src + i);
-    const int16x8_t C = vld1q_s16(dst + i);
-    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
-    const int16x8_t E = vaddq_s16(C, D);   // new_uv
-    vst1q_s16(dst + i, E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  const int16x8_t zero = vdupq_n_s16(0);
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t a0 = vld1q_s16(A + i + 0);
-    const int16x8_t a1 = vld1q_s16(A + i + 1);
-    const int16x8_t b0 = vld1q_s16(B + i + 0);
-    const int16x8_t b1 = vld1q_s16(B + i + 1);
-    const int16x8_t a0b1 = vaddq_s16(a0, b1);
-    const int16x8_t a1b0 = vaddq_s16(a1, b0);
-    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
-    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
-    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
-    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
-    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
-    const int16x8_t d0 = vaddq_s16(c1, a0);
-    const int16x8_t d1 = vaddq_s16(c0, a1);
-    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
-    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
-    const int16x8x2_t f = vzipq_s16(e0, e1);
-    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
-    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
-    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
-    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
-    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
-    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
-    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
-    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
-  }
-  for (; i < len; ++i) {
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
-  }
-}
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVNEON(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
-}
-
 #else  // !WEBP_USE_NEON
 
 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
 
 #endif  // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
index baa48d5..01a48f9 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -15,10 +15,12 @@
 
 #if defined(WEBP_USE_SSE2)
 
-#include "src/dsp/common_sse2.h"
 #include <stdlib.h>
 #include <emmintrin.h>
 
+#include "src/dsp/common_sse2.h"
+#include "src/utils/utils.h"
+
 //-----------------------------------------------------------------------------
 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
 
@@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
 // Load and replicate the U/V samples
 static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+  const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
   return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
 }
@@ -130,7 +132,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
   const __m128i rg0 = _mm_packus_epi16(*B, *A);
   const __m128i ba0 = _mm_packus_epi16(*R, *G);
 #endif
-  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0);  // rbrbrbrbrb...
   const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0);  // gagagagaga...
   const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
@@ -147,9 +149,10 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
-  const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
+  const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8));
   const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
-  const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
+  const __m128i g1 =
+      _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5);
   const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
   const __m128i rg = _mm_or_si128(r1, g1);
   const __m128i gb = _mm_or_si128(g2, b1);
@@ -747,128 +750,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
 }
 
-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  uint64_t diff = 0;
-  uint32_t tmp[4];
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i max = _mm_set1_epi16(MAX_Y);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i sum = zero;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
-    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
-    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
-    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
-    const __m128i F = _mm_add_epi16(C, D);       // new_y
-    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
-    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
-    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
-    _mm_storeu_si128((__m128i*)(dst + i), H);
-    sum = _mm_add_epi32(sum, I);
-  }
-  _mm_storeu_si128((__m128i*)tmp, sum);
-  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)dst[i] + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)abs(diff_y);
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i = 0;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
-    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
-    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
-    const __m128i E = _mm_add_epi16(C, D);   // new_uv
-    _mm_storeu_si128((__m128i*)(dst + i), E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const __m128i kCst8 = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(MAX_Y);
-  const __m128i zero = _mm_setzero_si128();
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
-    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
-    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
-    const __m128i a0b1 = _mm_add_epi16(a0, b1);
-    const __m128i a1b0 = _mm_add_epi16(a1, b0);
-    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
-    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
-    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
-    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
-    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
-    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
-    const __m128i d0 = _mm_add_epi16(c1, a0);
-    const __m128i d1 = _mm_add_epi16(c0, a1);
-    const __m128i e0 = _mm_srai_epi16(d0, 1);
-    const __m128i e1 = _mm_srai_epi16(d1, 1);
-    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
-    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
-    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
-    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
-    const __m128i h0 = _mm_add_epi16(g0, f0);
-    const __m128i h1 = _mm_add_epi16(g1, f1);
-    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
-    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
-    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
-    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
-  }
-  for (; i < len; ++i) {
-    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
-    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
-    // We reuse the common sub-expressions.
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
-}
-
 #else  // !WEBP_USE_SSE2
 
 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
 
 #endif  // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse41.c b/src/3rdparty/libwebp/src/dsp/yuv_sse41.c
index 579d1f7..f79b802 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse41.c
@@ -15,10 +15,12 @@
 
 #if defined(WEBP_USE_SSE41)
 
-#include "src/dsp/common_sse41.h"
 #include <stdlib.h>
 #include <smmintrin.h>
 
+#include "src/dsp/common_sse41.h"
+#include "src/utils/utils.h"
+
 //-----------------------------------------------------------------------------
 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
 
@@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
 // Load and replicate the U/V samples
 static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+  const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
   return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
 }
diff --git a/src/3rdparty/libwebp/src/enc/alpha_enc.c b/src/3rdparty/libwebp/src/enc/alpha_enc.c
index dce9ca9..f7c0269 100644
--- a/src/3rdparty/libwebp/src/enc/alpha_enc.c
+++ b/src/3rdparty/libwebp/src/enc/alpha_enc.c
@@ -86,7 +86,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   // a decoder bug related to alpha with color cache.
   // See: https://code.google.com/p/webp/issues/detail?id=239
   // Need to re-enable this later.
-  ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
+  ok = VP8LEncodeStream(&config, &picture, bw, /*use_cache=*/0);
   WebPPictureFree(&picture);
   ok = ok && !bw->error_;
   if (!ok) {
@@ -303,7 +303,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
   int ok = 1;
   const int reduce_levels = (quality < 100);
 
-  // quick sanity checks
+  // quick correctness checks
   assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
   assert(enc != NULL && pic != NULL && pic->a != NULL);
   assert(output != NULL && output_size != NULL);
@@ -361,7 +361,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
 //------------------------------------------------------------------------------
 // Main calls
 
-static int CompressAlphaJob(void* arg1, void* dummy) {
+static int CompressAlphaJob(void* arg1, void* unused) {
   VP8Encoder* const enc = (VP8Encoder*)arg1;
   const WebPConfig* config = enc->config_;
   uint8_t* alpha_data = NULL;
@@ -375,13 +375,13 @@ static int CompressAlphaJob(void* arg1, void* dummy) {
                    filter, effort_level, &alpha_data, &alpha_size)) {
     return 0;
   }
-  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
+  if (alpha_size != (uint32_t)alpha_size) {  // Soundness check.
     WebPSafeFree(alpha_data);
     return 0;
   }
   enc->alpha_data_size_ = (uint32_t)alpha_size;
   enc->alpha_data_ = alpha_data;
-  (void)dummy;
+  (void)unused;
   return 1;
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/analysis_enc.c b/src/3rdparty/libwebp/src/enc/analysis_enc.c
index 687757a..a0001ac 100644
--- a/src/3rdparty/libwebp/src/enc/analysis_enc.c
+++ b/src/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -126,16 +126,6 @@ static void InitHistogram(VP8Histogram* const histo) {
   histo->last_non_zero = 1;
 }
 
-static void MergeHistograms(const VP8Histogram* const in,
-                            VP8Histogram* const out) {
-  if (in->max_value > out->max_value) {
-    out->max_value = in->max_value;
-  }
-  if (in->last_non_zero > out->last_non_zero) {
-    out->last_non_zero = in->last_non_zero;
-  }
-}
-
 //------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram
 
@@ -285,49 +275,6 @@ static int FastMBAnalyze(VP8EncIterator* const it) {
   return 0;
 }
 
-static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
-                                   int best_alpha) {
-  uint8_t modes[16];
-  const int max_mode = MAX_INTRA4_MODE;
-  int i4_alpha;
-  VP8Histogram total_histo;
-  int cur_histo = 0;
-  InitHistogram(&total_histo);
-
-  VP8IteratorStartI4(it);
-  do {
-    int mode;
-    int best_mode_alpha = DEFAULT_ALPHA;
-    VP8Histogram histos[2];
-    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
-
-    VP8MakeIntra4Preds(it);
-    for (mode = 0; mode < max_mode; ++mode) {
-      int alpha;
-
-      InitHistogram(&histos[cur_histo]);
-      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
-                          0, 1, &histos[cur_histo]);
-      alpha = GetAlpha(&histos[cur_histo]);
-      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
-        best_mode_alpha = alpha;
-        modes[it->i4_] = mode;
-        cur_histo ^= 1;   // keep track of best histo so far.
-      }
-    }
-    // accumulate best histogram
-    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
-    // Note: we reuse the original samples for predictors
-  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF_ENC));
-
-  i4_alpha = GetAlpha(&total_histo);
-  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
-    VP8SetIntra4Mode(it, modes);
-    best_alpha = i4_alpha;
-  }
-  return best_alpha;
-}
-
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
   int best_alpha = DEFAULT_ALPHA;
   int smallest_alpha = 0;
@@ -371,13 +318,6 @@ static void MBAnalyze(VP8EncIterator* const it,
     best_alpha = FastMBAnalyze(it);
   } else {
     best_alpha = MBAnalyzeBestIntra16Mode(it);
-    if (enc->method_ >= 5) {
-      // We go and make a fast decision for intra4/intra16.
-      // It's usually not a good and definitive pick, but helps seeding the
-      // stats about level bit-cost.
-      // TODO(skal): improve criterion.
-      best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
-    }
   }
   best_uv_alpha = MBAnalyzeBestUVMode(it);
 
@@ -451,12 +391,14 @@ static int DoSegmentsJob(void* arg1, void* arg2) {
   return ok;
 }
 
+#ifdef WEBP_USE_THREAD
 static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
   int i;
   for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
   dst->alpha += src->alpha;
   dst->uv_alpha += src->uv_alpha;
 }
+#endif
 
 // initialize the job struct with some tasks to perform
 static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
@@ -485,10 +427,10 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
       (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled.
   if (do_segments) {
     const int last_row = enc->mb_h_;
-    // We give a little more than a half work to the main thread.
-    const int split_row = (9 * last_row + 15) >> 4;
     const int total_mb = last_row * enc->mb_w_;
 #ifdef WEBP_USE_THREAD
+    // We give a little more than a half work to the main thread.
+    const int split_row = (9 * last_row + 15) >> 4;
     const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
     const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
 #else
@@ -498,6 +440,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
         WebPGetWorkerInterface();
     SegmentJob main_job;
     if (do_mt) {
+#ifdef WEBP_USE_THREAD
       SegmentJob side_job;
       // Note the use of '&' instead of '&&' because we must call the functions
       // no matter what.
@@ -515,6 +458,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
       }
       worker_interface->End(&side_job.worker);
       if (ok) MergeJobs(&side_job, &main_job);  // merge results together
+#endif  // WEBP_USE_THREAD
     } else {
       // Even for single-thread case, we use the generic Worker tools.
       InitSegmentJob(enc, &main_job, 0, last_row);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c b/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c
index 516abd7..6968ef3 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c
@@ -15,10 +15,11 @@
 //
 
 #include <assert.h>
+#include <float.h>
 
+#include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
-#include "src/dsp/lossless_common.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"
 
@@ -30,15 +31,15 @@ extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                       const PixOrCopy v);
 
 typedef struct {
-  double alpha_[VALUES_IN_BYTE];
-  double red_[VALUES_IN_BYTE];
-  double blue_[VALUES_IN_BYTE];
-  double distance_[NUM_DISTANCE_CODES];
-  double* literal_;
+  float alpha_[VALUES_IN_BYTE];
+  float red_[VALUES_IN_BYTE];
+  float blue_[VALUES_IN_BYTE];
+  float distance_[NUM_DISTANCE_CODES];
+  float* literal_;
 } CostModel;
 
 static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], double output[]) {
+    int num_symbols, const uint32_t population_counts[], float output[]) {
   uint32_t sum = 0;
   int nonzeros = 0;
   int i;
@@ -51,7 +52,7 @@ static void ConvertPopulationCountTableToBitEstimates(
   if (nonzeros <= 1) {
     memset(output, 0, num_symbols * sizeof(*output));
   } else {
-    const double logsum = VP8LFastLog2(sum);
+    const float logsum = VP8LFastLog2(sum);
     for (i = 0; i < num_symbols; ++i) {
       output[i] = logsum - VP8LFastLog2(population_counts[i]);
     }
@@ -75,8 +76,8 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
   }
 
   ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(histo->palette_code_bits_),
-      histo->literal_, m->literal_);
+      VP8LHistogramNumCodes(histo->palette_code_bits_), histo->literal_,
+      m->literal_);
   ConvertPopulationCountTableToBitEstimates(
       VALUES_IN_BYTE, histo->red_, m->red_);
   ConvertPopulationCountTableToBitEstimates(
@@ -92,27 +93,27 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
   return ok;
 }
 
-static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+static WEBP_INLINE float GetLiteralCost(const CostModel* const m, uint32_t v) {
   return m->alpha_[v >> 24] +
          m->red_[(v >> 16) & 0xff] +
          m->literal_[(v >> 8) & 0xff] +
          m->blue_[v & 0xff];
 }
 
-static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+static WEBP_INLINE float GetCacheCost(const CostModel* const m, uint32_t idx) {
   const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
   return m->literal_[literal_idx];
 }
 
-static WEBP_INLINE double GetLengthCost(const CostModel* const m,
-                                        uint32_t length) {
+static WEBP_INLINE float GetLengthCost(const CostModel* const m,
+                                       uint32_t length) {
   int code, extra_bits;
   VP8LPrefixEncodeBits(length, &code, &extra_bits);
   return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
 }
 
-static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
-                                          uint32_t distance) {
+static WEBP_INLINE float GetDistanceCost(const CostModel* const m,
+                                         uint32_t distance) {
   int code, extra_bits;
   VP8LPrefixEncodeBits(distance, &code, &extra_bits);
   return m->distance_[code] + extra_bits;
@@ -122,20 +123,20 @@ static WEBP_INLINE void AddSingleLiteralWithCostModel(
     const uint32_t* const argb, VP8LColorCache* const hashers,
     const CostModel* const cost_model, int idx, int use_color_cache,
     float prev_cost, float* const cost, uint16_t* const dist_array) {
-  double cost_val = prev_cost;
+  float cost_val = prev_cost;
   const uint32_t color = argb[idx];
   const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
   if (ix >= 0) {
     // use_color_cache is true and hashers contains color
-    const double mul0 = 0.68;
+    const float mul0 = 0.68f;
     cost_val += GetCacheCost(cost_model, ix) * mul0;
   } else {
-    const double mul1 = 0.82;
+    const float mul1 = 0.82f;
     if (use_color_cache) VP8LColorCacheInsert(hashers, color);
     cost_val += GetLiteralCost(cost_model, color) * mul1;
   }
   if (cost[idx] > cost_val) {
-    cost[idx] = (float)cost_val;
+    cost[idx] = cost_val;
     dist_array[idx] = 1;  // only one is inserted.
   }
 }
@@ -172,7 +173,7 @@ struct CostInterval {
 
 // The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
 typedef struct {
-  double cost_;
+  float cost_;
   int start_;
   int end_;       // Exclusive.
 } CostCacheInterval;
@@ -187,7 +188,7 @@ typedef struct {
   int count_;  // The number of stored intervals.
   CostCacheInterval* cache_intervals_;
   size_t cache_intervals_size_;
-  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
   float* costs_;
   uint16_t* dist_array_;
   // Most of the time, we only need few intervals -> use a free-list, to avoid
@@ -262,10 +263,13 @@ static int CostManagerInit(CostManager* const manager,
   CostManagerInitFreeList(manager);
 
   // Fill in the cost_cache_.
+  // Has to be done in two passes due to a GCC bug on i686
+  // related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
+  for (i = 0; i < cost_cache_size; ++i) {
+    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+  }
   manager->cache_intervals_size_ = 1;
-  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
   for (i = 1; i < cost_cache_size; ++i) {
-    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
     // Get the number of bound intervals.
     if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
       ++manager->cache_intervals_size_;
@@ -294,7 +298,7 @@ static int CostManagerInit(CostManager* const manager,
     cur->end_ = 1;
     cur->cost_ = manager->cost_cache_[0];
     for (i = 1; i < cost_cache_size; ++i) {
-      const double cost_val = manager->cost_cache_[i];
+      const float cost_val = manager->cost_cache_[i];
       if (cost_val != cur->cost_) {
         ++cur;
         // Initialize an interval.
@@ -303,6 +307,8 @@ static int CostManagerInit(CostManager* const manager,
       }
       cur->end_ = i + 1;
     }
+    assert((size_t)(cur - manager->cache_intervals_) + 1 ==
+           manager->cache_intervals_size_);
   }
 
   manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
@@ -311,7 +317,7 @@ static int CostManagerInit(CostManager* const manager,
     return 0;
   }
   // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = FLT_MAX;
 
   return 1;
 }
@@ -457,7 +463,7 @@ static WEBP_INLINE void InsertInterval(CostManager* const manager,
 // If handling the interval or one of its subintervals becomes to heavy, its
 // contribution is added to the costs right away.
 static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     double distance_cost, int position,
+                                     float distance_cost, int position,
                                      int len) {
   size_t i;
   CostInterval* interval = manager->head_;
@@ -474,7 +480,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
       const int k = j - position;
       float cost_tmp;
       assert(k >= 0 && k < MAX_LENGTH);
-      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+      cost_tmp = distance_cost + manager->cost_cache_[k];
 
       if (manager->costs_[j] > cost_tmp) {
         manager->costs_[j] = cost_tmp;
@@ -492,7 +498,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
     const int end = position + (cost_cache_intervals[i].end_ > len
                                  ? len
                                  : cost_cache_intervals[i].end_);
-    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+    const float cost = distance_cost + cost_cache_intervals[i].cost_;
 
     for (; interval != NULL && interval->start_ < end;
          interval = interval_next) {
@@ -570,22 +576,21 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   const size_t literal_array_size =
-      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
-                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+      sizeof(float) * (VP8LHistogramNumCodes(cache_bits));
   const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
   CostModel* const cost_model =
       (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
   VP8LColorCache hashers;
   CostManager* cost_manager =
-      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
+      (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
   int offset_prev = -1, len_prev = -1;
-  double offset_cost = -1;
+  float offset_cost = -1.f;
   int first_offset_is_constant = -1;  // initialized with 'impossible' value
   int reach = 0;
 
   if (cost_model == NULL || cost_manager == NULL) goto Error;
 
-  cost_model->literal_ = (double*)(cost_model + 1);
+  cost_model->literal_ = (float*)(cost_model + 1);
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
@@ -675,7 +680,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   }
 
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   CostManagerClear(cost_manager);
   WebPSafeFree(cost_model);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references_enc.c b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
index d445b40..49a0fac 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references_enc.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
@@ -10,16 +10,20 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 
+#include "src/enc/backward_references_enc.h"
+
 #include <assert.h>
+#include <float.h>
 #include <math.h>
 
-#include "src/enc/backward_references_enc.h"
-#include "src/enc/histogram_enc.h"
+#include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include "src/dsp/dsp.h"
+#include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/encode.h"
 
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
 
@@ -103,6 +107,20 @@ void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
   }
 }
 
+// Swaps the content of two VP8LBackwardRefs.
+static void BackwardRefsSwap(VP8LBackwardRefs* const refs1,
+                             VP8LBackwardRefs* const refs2) {
+  const int point_to_refs1 =
+      (refs1->tail_ != NULL && refs1->tail_ == &refs1->refs_);
+  const int point_to_refs2 =
+      (refs2->tail_ != NULL && refs2->tail_ == &refs2->refs_);
+  const VP8LBackwardRefs tmp = *refs1;
+  *refs1 = *refs2;
+  *refs2 = tmp;
+  if (point_to_refs2) refs1->tail_ = &refs1->refs_;
+  if (point_to_refs1) refs2->tail_ = &refs2->refs_;
+}
+
 void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
   assert(refs != NULL);
   memset(refs, 0, sizeof(*refs));
@@ -154,6 +172,22 @@ static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
   return b;
 }
 
+// Return 1 on success, 0 on error.
+static int BackwardRefsClone(const VP8LBackwardRefs* const from,
+                             VP8LBackwardRefs* const to) {
+  const PixOrCopyBlock* block_from = from->refs_;
+  VP8LClearBackwardRefs(to);
+  while (block_from != NULL) {
+    PixOrCopyBlock* const block_to = BackwardRefsNewBlock(to);
+    if (block_to == NULL) return 0;
+    memcpy(block_to->start_, block_from->start_,
+           block_from->size_ * sizeof(PixOrCopy));
+    block_to->size_ = block_from->size_;
+    block_from = block_from->next_;
+  }
+  return 1;
+}
+
 extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                       const PixOrCopy v);
 void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
@@ -224,10 +258,13 @@ static WEBP_INLINE int MaxFindCopyLength(int len) {
 
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                       const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort) {
+                      int low_effort, const WebPPicture* const pic,
+                      int percent_range, int* const percent) {
   const int size = xsize * ysize;
   const int iter_max = GetMaxItersForQuality(quality);
   const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+  int remaining_percent = percent_range;
+  int percent_start = *percent;
   int pos;
   int argb_comp;
   uint32_t base_position;
@@ -245,7 +282,13 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
 
   hash_to_first_index =
       (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
-  if (hash_to_first_index == NULL) return 0;
+  if (hash_to_first_index == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return 0;
+  }
+
+  percent_range = remaining_percent / 2;
+  remaining_percent -= percent_range;
 
   // Set the int32_t array to -1.
   memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
@@ -292,12 +335,22 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
       hash_to_first_index[hash_code] = pos++;
       argb_comp = argb_comp_next;
     }
+
+    if (!WebPReportProgress(
+            pic, percent_start + percent_range * pos / (size - 2), percent)) {
+      WebPSafeFree(hash_to_first_index);
+      return 0;
+    }
   }
   // Process the penultimate pixel.
   chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
 
   WebPSafeFree(hash_to_first_index);
 
+  percent_start += percent_range;
+  if (!WebPReportProgress(pic, percent_start, percent)) return 0;
+  percent_range = remaining_percent;
+
   // Find the best match interval at each pixel, defined by an offset to the
   // pixel and a length. The right-most pixel cannot match anything to the right
   // (hence a best length of 0) and the left-most pixel nothing to the left
@@ -386,8 +439,17 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
         max_base_position = base_position;
       }
     }
+
+    if (!WebPReportProgress(pic,
+                            percent_start + percent_range *
+                                                (size - 2 - base_position) /
+                                                (size - 2),
+                            percent)) {
+      return 0;
+    }
   }
-  return 1;
+
+  return WebPReportProgress(pic, percent_start + percent_range, percent);
 }
 
 static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
@@ -697,7 +759,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
                                   int* const best_cache_bits) {
   int i;
   const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
-  double entropy_min = MAX_ENTROPY;
+  float entropy_min = MAX_ENTROPY;
   int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
   VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
@@ -753,12 +815,18 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
         }
       }
     } else {
+      int code, extra_bits, extra_bits_value;
       // We should compute the contribution of the (distance,length)
       // histograms but those are the same independently from the cache size.
       // As those constant contributions are in the end added to the other
-      // histogram contributions, we can safely ignore them.
+      // histogram contributions, we can ignore them, except for the length
+      // prefix that is part of the literal_ histogram.
       int len = PixOrCopyLength(v);
       uint32_t argb_prev = *argb ^ 0xffffffffu;
+      VP8LPrefixEncode(len, &code, &extra_bits, &extra_bits_value);
+      for (i = 0; i <= cache_bits_max; ++i) {
+        ++histos[i]->literal_[NUM_LITERAL_CODES + code];
+      }
       // Update the color caches.
       do {
         if (*argb != argb_prev) {
@@ -776,14 +844,14 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
   }
 
   for (i = 0; i <= cache_bits_max; ++i) {
-    const double entropy = VP8LHistogramEstimateBits(histos[i]);
+    const float entropy = VP8LHistogramEstimateBits(histos[i]);
     if (i == 0 || entropy < entropy_min) {
       entropy_min = entropy;
       *best_cache_bits = i;
     }
   }
   ok = 1;
-Error:
+ Error:
   for (i = 0; i <= cache_bits_max; ++i) {
     if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
     VP8LFreeHistogram(histos[i]);
@@ -842,16 +910,21 @@ extern int VP8LBackwardReferencesTraceBackwards(
     int xsize, int ysize, const uint32_t* const argb, int cache_bits,
     const VP8LHashChain* const hash_chain,
     const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
-static VP8LBackwardRefs* GetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best,
-    VP8LBackwardRefs* worst) {
-  const int cache_bits_initial = *cache_bits;
-  double bit_cost_best = -1;
+static int GetBackwardReferences(int width, int height,
+                                 const uint32_t* const argb, int quality,
+                                 int lz77_types_to_try, int cache_bits_max,
+                                 int do_no_cache,
+                                 const VP8LHashChain* const hash_chain,
+                                 VP8LBackwardRefs* const refs,
+                                 int* const cache_bits_best) {
   VP8LHistogram* histo = NULL;
-  int lz77_type, lz77_type_best = 0;
+  int i, lz77_type;
+  // Index 0 is for a color cache, index 1 for no cache (if needed).
+  int lz77_types_best[2] = {0, 0};
+  float bit_costs_best[2] = {FLT_MAX, FLT_MAX};
   VP8LHashChain hash_chain_box;
+  VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
+  int status = 0;
   memset(&hash_chain_box, 0, sizeof(hash_chain_box));
 
   histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
@@ -860,86 +933,136 @@ static VP8LBackwardRefs* GetBackwardReferences(
   for (lz77_type = 1; lz77_types_to_try;
        lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
     int res = 0;
-    double bit_cost;
-    int cache_bits_tmp = cache_bits_initial;
+    float bit_cost = 0.f;
     if ((lz77_types_to_try & lz77_type) == 0) continue;
     switch (lz77_type) {
       case kLZ77RLE:
-        res = BackwardReferencesRle(width, height, argb, 0, worst);
+        res = BackwardReferencesRle(width, height, argb, 0, refs_tmp);
         break;
       case kLZ77Standard:
         // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
         // cache is not that different in practice.
-        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst);
+        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain,
+                                     refs_tmp);
         break;
       case kLZ77Box:
         if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
         res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
-                                        &hash_chain_box, worst);
+                                        &hash_chain_box, refs_tmp);
         break;
       default:
         assert(0);
     }
     if (!res) goto Error;
 
-    // Next, try with a color cache and update the references.
-    if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) {
-      goto Error;
-    }
-    if (cache_bits_tmp > 0) {
-      if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) {
-        goto Error;
+    // Start with the no color cache case.
+    for (i = 1; i >= 0; --i) {
+      int cache_bits = (i == 1) ? 0 : cache_bits_max;
+
+      if (i == 1 && !do_no_cache) continue;
+
+      if (i == 0) {
+        // Try with a color cache.
+        if (!CalculateBestCacheSize(argb, quality, refs_tmp, &cache_bits)) {
+          goto Error;
+        }
+        if (cache_bits > 0) {
+          if (!BackwardRefsWithLocalCache(argb, cache_bits, refs_tmp)) {
+            goto Error;
+          }
+        }
+      }
+
+      if (i == 0 && do_no_cache && cache_bits == 0) {
+        // No need to re-compute bit_cost as it was computed at i == 1.
+      } else {
+        VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+        bit_cost = VP8LHistogramEstimateBits(histo);
       }
-    }
 
-    // Keep the best backward references.
-    VP8LHistogramCreate(histo, worst, cache_bits_tmp);
-    bit_cost = VP8LHistogramEstimateBits(histo);
-    if (lz77_type_best == 0 || bit_cost < bit_cost_best) {
-      VP8LBackwardRefs* const tmp = worst;
-      worst = best;
-      best = tmp;
-      bit_cost_best = bit_cost;
-      *cache_bits = cache_bits_tmp;
-      lz77_type_best = lz77_type;
+      if (bit_cost < bit_costs_best[i]) {
+        if (i == 1) {
+          // Do not swap as the full cache analysis would have the wrong
+          // VP8LBackwardRefs to start with.
+          if (!BackwardRefsClone(refs_tmp, &refs[1])) goto Error;
+        } else {
+          BackwardRefsSwap(refs_tmp, &refs[0]);
+        }
+        bit_costs_best[i] = bit_cost;
+        lz77_types_best[i] = lz77_type;
+        if (i == 0) *cache_bits_best = cache_bits;
+      }
     }
   }
-  assert(lz77_type_best > 0);
+  assert(lz77_types_best[0] > 0);
+  assert(!do_no_cache || lz77_types_best[1] > 0);
 
   // Improve on simple LZ77 but only for high quality (TraceBackwards is
   // costly).
-  if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) &&
-      quality >= 25) {
-    const VP8LHashChain* const hash_chain_tmp =
-        (lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box;
-    if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits,
-                                             hash_chain_tmp, best, worst)) {
-      double bit_cost_trace;
-      VP8LHistogramCreate(histo, worst, *cache_bits);
+  for (i = 1; i >= 0; --i) {
+    if (i == 1 && !do_no_cache) continue;
+    if ((lz77_types_best[i] == kLZ77Standard ||
+         lz77_types_best[i] == kLZ77Box) &&
+        quality >= 25) {
+      const VP8LHashChain* const hash_chain_tmp =
+          (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
+      const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
+      float bit_cost_trace;
+      if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+                                                hash_chain_tmp, &refs[i],
+                                                refs_tmp)) {
+        goto Error;
+      }
+      VP8LHistogramCreate(histo, refs_tmp, cache_bits);
       bit_cost_trace = VP8LHistogramEstimateBits(histo);
-      if (bit_cost_trace < bit_cost_best) best = worst;
+      if (bit_cost_trace < bit_costs_best[i]) {
+        BackwardRefsSwap(refs_tmp, &refs[i]);
+      }
     }
-  }
 
-  BackwardReferences2DLocality(width, best);
+    BackwardReferences2DLocality(width, &refs[i]);
+
+    if (i == 1 && lz77_types_best[0] == lz77_types_best[1] &&
+        *cache_bits_best == 0) {
+      // If the best cache size is 0 and we have the same best LZ77, just copy
+      // the data over and stop here.
+      if (!BackwardRefsClone(&refs[1], &refs[0])) goto Error;
+      break;
+    }
+  }
+  status = 1;
 
-Error:
+ Error:
   VP8LHashChainClear(&hash_chain_box);
   VP8LFreeHistogram(histo);
-  return best;
+  return status;
 }
 
-VP8LBackwardRefs* VP8LGetBackwardReferences(
+int VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
-    VP8LBackwardRefs* const refs_tmp2) {
+    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+    int* const cache_bits_best, const WebPPicture* const pic, int percent_range,
+    int* const percent) {
   if (low_effort) {
-    return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
-                                          hash_chain, refs_tmp1);
+    VP8LBackwardRefs* refs_best;
+    *cache_bits_best = cache_bits_max;
+    refs_best = GetBackwardReferencesLowEffort(
+        width, height, argb, cache_bits_best, hash_chain, refs);
+    if (refs_best == NULL) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+      return 0;
+    }
+    // Set it in first position.
+    BackwardRefsSwap(refs_best, &refs[0]);
   } else {
-    return GetBackwardReferences(width, height, argb, quality,
-                                 lz77_types_to_try, cache_bits, hash_chain,
-                                 refs_tmp1, refs_tmp2);
+    if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
+                               cache_bits_max, do_no_cache, hash_chain, refs,
+                               cache_bits_best)) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+      return 0;
+    }
   }
+
+  return WebPReportProgress(pic, *percent + percent_range, percent);
 }
diff --git a/src/3rdparty/libwebp/src/enc/backward_references_enc.h b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
index 103ddfd..4dff1c2 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references_enc.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
@@ -16,6 +16,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "src/webp/types.h"
+#include "src/webp/encode.h"
 #include "src/webp/format_constants.h"
 
 #ifdef __cplusplus
@@ -133,10 +134,11 @@ struct VP8LHashChain {
 
 // Must be called first, to set size.
 int VP8LHashChainInit(VP8LHashChain* const p, int size);
-// Pre-compute the best matches for argb.
+// Pre-compute the best matches for argb. pic and percent are for progress.
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                       const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort);
+                      int low_effort, const WebPPicture* const pic,
+                      int percent_range, int* const percent);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
 static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
@@ -218,14 +220,22 @@ enum VP8LLZ77Type {
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
-// The optimal cache bits is evaluated and set for the *cache_bits parameter.
-// The return value is the pointer to the best of the two backward refs viz,
-// refs[0] or refs[1].
-VP8LBackwardRefs* VP8LGetBackwardReferences(
+// The optimal cache bits is evaluated and set for the *cache_bits_best
+// parameter with the matching refs_best.
+// If do_no_cache == 0, refs is an array of 2 values and the best
+// VP8LBackwardRefs is put in the first element.
+// If do_no_cache != 0, refs is an array of 3 values and the best
+// VP8LBackwardRefs is put in the first element, the best value with no-cache in
+// the second element.
+// In both cases, the last element is used as temporary internally.
+// pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
+int VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
-    VP8LBackwardRefs* const refs_tmp2);
+    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+    int* const cache_bits_best, const WebPPicture* const pic, int percent_range,
+    int* const percent);
 
 #ifdef __cplusplus
 }
diff --git a/src/3rdparty/libwebp/src/enc/config_enc.c b/src/3rdparty/libwebp/src/enc/config_enc.c
index 9d48289..3518b41 100644
--- a/src/3rdparty/libwebp/src/enc/config_enc.c
+++ b/src/3rdparty/libwebp/src/enc/config_enc.c
@@ -39,6 +39,8 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->partitions = 0;
   config->segments = 4;
   config->pass = 1;
+  config->qmin = 0;
+  config->qmax = 100;
   config->show_compressed = 0;
   config->preprocessing = 0;
   config->autofilter = 0;
@@ -106,6 +108,9 @@ int WebPValidateConfig(const WebPConfig* config) {
   if (config->filter_type < 0 || config->filter_type > 1) return 0;
   if (config->autofilter < 0 || config->autofilter > 1) return 0;
   if (config->pass < 1 || config->pass > 10) return 0;
+  if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) {
+    return 0;
+  }
   if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
   if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
   if (config->partitions < 0 || config->partitions > 3) return 0;
diff --git a/src/3rdparty/libwebp/src/enc/frame_enc.c b/src/3rdparty/libwebp/src/enc/frame_enc.c
index 1aec376..b93d9e5 100644
--- a/src/3rdparty/libwebp/src/enc/frame_enc.c
+++ b/src/3rdparty/libwebp/src/enc/frame_enc.c
@@ -31,10 +31,15 @@
 // we allow 2k of extra head-room in PARTITION0 limit.
 #define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
 
+static float Clamp(float v, float min, float max) {
+  return (v < min) ? min : (v > max) ? max : v;
+}
+
 typedef struct {  // struct for organizing convergence in either size or PSNR
   int is_first;
   float dq;
   float q, last_q;
+  float qmin, qmax;
   double value, last_value;   // PSNR or size
   double target;
   int do_size_search;
@@ -47,7 +52,9 @@ static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
 
   s->is_first = 1;
   s->dq = 10.f;
-  s->q = s->last_q = enc->config_->quality;
+  s->qmin = 1.f * enc->config_->qmin;
+  s->qmax = 1.f * enc->config_->qmax;
+  s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax);
   s->target = do_size_search ? (double)target_size
             : (target_PSNR > 0.) ? target_PSNR
             : 40.;   // default, just in case
@@ -56,10 +63,6 @@ static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
   return do_size_search;
 }
 
-static float Clamp(float v, float min, float max) {
-  return (v < min) ? min : (v > max) ? max : v;
-}
-
 static float ComputeNextQ(PassStats* const s) {
   float dq;
   if (s->is_first) {
@@ -75,7 +78,7 @@ static float ComputeNextQ(PassStats* const s) {
   s->dq = Clamp(dq, -30.f, 30.f);
   s->last_q = s->q;
   s->last_value = s->value;
-  s->q = Clamp(s->q + s->dq, 0.f, 100.f);
+  s->q = Clamp(s->q + s->dq, s->qmin, s->qmax);
   return s->q;
 }
 
@@ -775,6 +778,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
   // Roughly refresh the proba eight times per pass
   int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
   int num_pass_left = enc->config_->pass;
+  int remaining_progress = 40;  // percents
   const int do_search = enc->do_search_;
   VP8EncIterator it;
   VP8EncProba* const proba = &enc->proba_;
@@ -802,6 +806,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     uint64_t size_p0 = 0;
     uint64_t distortion = 0;
     int cnt = max_count;
+    // The final number of passes is not trivial to know in advance.
+    const int pass_progress = remaining_progress / (2 + num_pass_left);
+    remaining_progress -= pass_progress;
     VP8IteratorInit(enc, &it);
     SetLoopParams(enc, stats.q);
     if (is_last_pass) {
@@ -829,7 +836,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
         StoreSideInfo(&it);
         VP8StoreFilterStats(&it);
         VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+        ok = VP8IteratorProgress(&it, pass_progress);
       }
       VP8IteratorSaveBoundary(&it);
     } while (ok && VP8IteratorNext(&it));
@@ -848,9 +855,10 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     }
 
 #if (DEBUG_SEARCH > 0)
-    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
+    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf "
+           " range:[%.1f, %.1f]\n",
            num_pass_left, stats.last_value, stats.value,
-           stats.last_q, stats.q, stats.dq);
+           stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax);
 #endif
     if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
       ++num_pass_left;
@@ -874,7 +882,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                        (const uint8_t*)proba->coeffs_, 1);
   }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+                                &enc->percent_);
   return PostLoopFinalize(&it, ok);
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/histogram_enc.c b/src/3rdparty/libwebp/src/enc/histogram_enc.c
index d89b985..8418def 100644
--- a/src/3rdparty/libwebp/src/enc/histogram_enc.c
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -13,15 +13,17 @@
 #include "src/webp/config.h"
 #endif
 
+#include <float.h>
 #include <math.h>
 
-#include "src/enc/backward_references_enc.h"
-#include "src/enc/histogram_enc.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"
 
-#define MAX_COST 1.e38
+#define MAX_BIT_COST FLT_MAX
 
 // Number of partitions for the three dominant (literal, red and blue) symbol
 // costs.
@@ -208,6 +210,7 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
   } else if (PixOrCopyIsCacheIdx(v)) {
     const int literal_ix =
         NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+    assert(histo->palette_code_bits_ != 0);
     ++histo->literal_[literal_ix];
   } else {
     int code, extra_bits;
@@ -227,8 +230,8 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
 // -----------------------------------------------------------------------------
 // Entropy-related functions.
 
-static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
-  double mix;
+static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+  float mix;
   if (entropy->nonzeros < 5) {
     if (entropy->nonzeros <= 1) {
       return 0;
@@ -237,67 +240,67 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
     // Let's mix in a bit of entropy to favor good clustering when
     // distributions of these are combined.
     if (entropy->nonzeros == 2) {
-      return 0.99 * entropy->sum + 0.01 * entropy->entropy;
+      return 0.99f * entropy->sum + 0.01f * entropy->entropy;
     }
     // No matter what the entropy says, we cannot be better than min_limit
     // with Huffman coding. I am mixing a bit of entropy into the
     // min_limit since it produces much better (~0.5 %) compression results
     // perhaps because of better entropy clustering.
     if (entropy->nonzeros == 3) {
-      mix = 0.95;
+      mix = 0.95f;
     } else {
-      mix = 0.7;  // nonzeros == 4.
+      mix = 0.7f;  // nonzeros == 4.
     }
   } else {
-    mix = 0.627;
+    mix = 0.627f;
   }
 
   {
-    double min_limit = 2 * entropy->sum - entropy->max_val;
-    min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+    float min_limit = 2.f * entropy->sum - entropy->max_val;
+    min_limit = mix * min_limit + (1.f - mix) * entropy->entropy;
     return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
   }
 }
 
-double VP8LBitsEntropy(const uint32_t* const array, int n) {
+float VP8LBitsEntropy(const uint32_t* const array, int n) {
   VP8LBitEntropy entropy;
   VP8LBitsEntropyUnrefined(array, n, &entropy);
 
   return BitsEntropyRefine(&entropy);
 }
 
-static double InitialHuffmanCost(void) {
+static float InitialHuffmanCost(void) {
   // Small bias because Huffman code length is typically not stored in
   // full length.
   static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
-  static const double kSmallBias = 9.1;
+  static const float kSmallBias = 9.1f;
   return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
 }
 
 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
-static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+static float FinalHuffmanCost(const VP8LStreaks* const stats) {
   // The constants in this function are experimental and got rounded from
   // their original values in 1/8 when switched to 1/1024.
-  double retval = InitialHuffmanCost();
+  float retval = InitialHuffmanCost();
   // Second coefficient: Many zeros in the histogram are covered efficiently
   // by a run-length encode. Originally 2/8.
-  retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  retval += stats->counts[0] * 1.5625f + 0.234375f * stats->streaks[0][1];
   // Second coefficient: Constant values are encoded less efficiently, but still
   // RLE'ed. Originally 6/8.
-  retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  retval += stats->counts[1] * 2.578125f + 0.703125f * stats->streaks[1][1];
   // 0s are usually encoded more efficiently than non-0s.
   // Originally 15/8.
-  retval += 1.796875 * stats->streaks[0][0];
+  retval += 1.796875f * stats->streaks[0][0];
   // Originally 26/8.
-  retval += 3.28125 * stats->streaks[1][0];
+  retval += 3.28125f * stats->streaks[1][0];
   return retval;
 }
 
 // Get the symbol entropy for the distribution 'population'.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
-static double PopulationCost(const uint32_t* const population, int length,
-                             uint32_t* const trivial_sym,
-                             uint8_t* const is_used) {
+static float PopulationCost(const uint32_t* const population, int length,
+                            uint32_t* const trivial_sym,
+                            uint8_t* const is_used) {
   VP8LBitEntropy bit_entropy;
   VP8LStreaks stats;
   VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
@@ -313,11 +316,10 @@ static double PopulationCost(const uint32_t* const population, int length,
 
 // trivial_at_end is 1 if the two histograms only have one element that is
 // non-zero: both the zero-th one, or both the last one.
-static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
-                                             const uint32_t* const Y,
-                                             int length, int is_X_used,
-                                             int is_Y_used,
-                                             int trivial_at_end) {
+static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
+                                            const uint32_t* const Y, int length,
+                                            int is_X_used, int is_Y_used,
+                                            int trivial_at_end) {
   VP8LStreaks stats;
   if (trivial_at_end) {
     // This configuration is due to palettization that transforms an indexed
@@ -355,7 +357,7 @@ static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
-double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
   return
       PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
                      NULL, &p->is_used_[0])
@@ -372,8 +374,7 @@ double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
 
 static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
                                        const VP8LHistogram* const b,
-                                       double cost_threshold,
-                                       double* cost) {
+                                       float cost_threshold, float* cost) {
   const int palette_code_bits = a->palette_code_bits_;
   int trivial_at_end = 0;
   assert(a->palette_code_bits_ == b->palette_code_bits_);
@@ -438,12 +439,11 @@ static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
 // Since the previous score passed is 'cost_threshold', we only need to compare
 // the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
 // early.
-static double HistogramAddEval(const VP8LHistogram* const a,
-                               const VP8LHistogram* const b,
-                               VP8LHistogram* const out,
-                               double cost_threshold) {
-  double cost = 0;
-  const double sum_cost = a->bit_cost_ + b->bit_cost_;
+static float HistogramAddEval(const VP8LHistogram* const a,
+                              const VP8LHistogram* const b,
+                              VP8LHistogram* const out, float cost_threshold) {
+  float cost = 0;
+  const float sum_cost = a->bit_cost_ + b->bit_cost_;
   cost_threshold += sum_cost;
 
   if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
@@ -458,10 +458,10 @@ static double HistogramAddEval(const VP8LHistogram* const a,
 // Same as HistogramAddEval(), except that the resulting histogram
 // is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
 // the term C(b) which is constant over all the evaluations.
-static double HistogramAddThresh(const VP8LHistogram* const a,
-                                 const VP8LHistogram* const b,
-                                 double cost_threshold) {
-  double cost;
+static float HistogramAddThresh(const VP8LHistogram* const a,
+                                const VP8LHistogram* const b,
+                                float cost_threshold) {
+  float cost;
   assert(a != NULL && b != NULL);
   cost = -a->bit_cost_;
   GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
@@ -472,24 +472,22 @@ static double HistogramAddThresh(const VP8LHistogram* const a,
 
 // The structure to keep track of cost range for the three dominant entropy
 // symbols.
-// TODO(skal): Evaluate if float can be used here instead of double for
-// representing the entropy costs.
 typedef struct {
-  double literal_max_;
-  double literal_min_;
-  double red_max_;
-  double red_min_;
-  double blue_max_;
-  double blue_min_;
+  float literal_max_;
+  float literal_min_;
+  float red_max_;
+  float red_min_;
+  float blue_max_;
+  float blue_min_;
 } DominantCostRange;
 
 static void DominantCostRangeInit(DominantCostRange* const c) {
   c->literal_max_ = 0.;
-  c->literal_min_ = MAX_COST;
+  c->literal_min_ = MAX_BIT_COST;
   c->red_max_ = 0.;
-  c->red_min_ = MAX_COST;
+  c->red_min_ = MAX_BIT_COST;
   c->blue_max_ = 0.;
-  c->blue_min_ = MAX_COST;
+  c->blue_min_ = MAX_BIT_COST;
 }
 
 static void UpdateDominantCostRange(
@@ -504,10 +502,9 @@ static void UpdateDominantCostRange(
 
 static void UpdateHistogramCost(VP8LHistogram* const h) {
   uint32_t alpha_sym, red_sym, blue_sym;
-  const double alpha_cost =
-      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
-                     &h->is_used_[3]);
-  const double distance_cost =
+  const float alpha_cost =
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
+  const float distance_cost =
       PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
       VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
@@ -528,10 +525,10 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
   }
 }
 
-static int GetBinIdForEntropy(double min, double max, double val) {
-  const double range = max - min;
+static int GetBinIdForEntropy(float min, float max, float val) {
+  const float range = max - min;
   if (range > 0.) {
-    const double delta = val - min;
+    const float delta = val - min;
     return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
   } else {
     return 0;
@@ -640,15 +637,11 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
 
 // Merges some histograms with same bin_id together if it's advantageous.
 // Sets the remaining histograms to NULL.
-static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
-                                       int *num_used,
-                                       const uint16_t* const clusters,
-                                       uint16_t* const cluster_mappings,
-                                       VP8LHistogram* cur_combo,
-                                       const uint16_t* const bin_map,
-                                       int num_bins,
-                                       double combine_cost_factor,
-                                       int low_effort) {
+static void HistogramCombineEntropyBin(
+    VP8LHistogramSet* const image_histo, int* num_used,
+    const uint16_t* const clusters, uint16_t* const cluster_mappings,
+    VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
+    float combine_cost_factor, int low_effort) {
   VP8LHistogram** const histograms = image_histo->histograms;
   int idx;
   struct {
@@ -678,11 +671,10 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
       cluster_mappings[clusters[idx]] = clusters[first];
     } else {
       // try to merge #idx into #first (both share the same bin_id)
-      const double bit_cost = histograms[idx]->bit_cost_;
-      const double bit_cost_thresh = -bit_cost * combine_cost_factor;
-      const double curr_cost_diff =
-          HistogramAddEval(histograms[first], histograms[idx],
-                           cur_combo, bit_cost_thresh);
+      const float bit_cost = histograms[idx]->bit_cost_;
+      const float bit_cost_thresh = -bit_cost * combine_cost_factor;
+      const float curr_cost_diff = HistogramAddEval(
+          histograms[first], histograms[idx], cur_combo, bit_cost_thresh);
       if (curr_cost_diff < bit_cost_thresh) {
         // Try to merge two histograms only if the combo is a trivial one or
         // the two candidate histograms are already non-trivial.
@@ -730,8 +722,8 @@ static uint32_t MyRand(uint32_t* const seed) {
 typedef struct {
   int idx1;
   int idx2;
-  double cost_diff;
-  double cost_combo;
+  float cost_diff;
+  float cost_combo;
 } HistogramPair;
 
 typedef struct {
@@ -786,10 +778,9 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
 // Update the cost diff and combo of a pair of histograms. This needs to be
 // called when the the histograms have been merged with a third one.
 static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
-                                 const VP8LHistogram* const h2,
-                                 double threshold,
+                                 const VP8LHistogram* const h2, float threshold,
                                  HistogramPair* const pair) {
-  const double sum_cost = h1->bit_cost_ + h2->bit_cost_;
+  const float sum_cost = h1->bit_cost_ + h2->bit_cost_;
   pair->cost_combo = 0.;
   GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
   pair->cost_diff = pair->cost_combo - sum_cost;
@@ -798,9 +789,9 @@ static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
 // Create a pair from indices "idx1" and "idx2" provided its cost
 // is inferior to "threshold", a negative entropy.
 // It returns the cost of the pair, or 0. if it superior to threshold.
-static double HistoQueuePush(HistoQueue* const histo_queue,
-                             VP8LHistogram** const histograms, int idx1,
-                             int idx2, double threshold) {
+static float HistoQueuePush(HistoQueue* const histo_queue,
+                            VP8LHistogram** const histograms, int idx1,
+                            int idx2, float threshold) {
   const VP8LHistogram* h1;
   const VP8LHistogram* h2;
   HistogramPair pair;
@@ -944,8 +935,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
            ++tries_with_no_success < num_tries_no_success;
        ++iter) {
     int* mapping_index;
-    double best_cost =
-        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+    float best_cost =
+        (histo_queue.size == 0) ? 0.f : histo_queue.queue[0].cost_diff;
     int best_idx1 = -1, best_idx2 = 1;
     const uint32_t rand_range = (*num_used - 1) * (*num_used);
     // (*num_used) / 2 was chosen empirically. Less means faster but worse
@@ -954,7 +945,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
 
     // Pick random samples.
     for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
-      double curr_cost;
+      float curr_cost;
       // Choose two different histograms at random and try to combine them.
       const uint32_t tmp = MyRand(&seed) % rand_range;
       uint32_t idx1 = tmp / (*num_used - 1);
@@ -1033,7 +1024,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
   *do_greedy = (*num_used <= min_cluster_size);
   ok = 1;
 
-End:
+ End:
   HistoQueueClear(&histo_queue);
   WebPSafeFree(mappings);
   return ok;
@@ -1056,7 +1047,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   if (out_size > 1) {
     for (i = 0; i < in_size; ++i) {
       int best_out = 0;
-      double best_bits = MAX_COST;
+      float best_bits = MAX_BIT_COST;
       int k;
       if (in_histo[i] == NULL) {
         // Arbitrarily set to the previous value if unused to help future LZ77.
@@ -1064,7 +1055,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
         continue;
       }
       for (k = 0; k < out_size; ++k) {
-        double cur_bits;
+        float cur_bits;
         cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
         if (k == 0 || cur_bits < best_bits) {
           best_bits = cur_bits;
@@ -1092,13 +1083,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   }
 }
 
-static double GetCombineCostFactor(int histo_size, int quality) {
-  double combine_cost_factor = 0.16;
+static float GetCombineCostFactor(int histo_size, int quality) {
+  float combine_cost_factor = 0.16f;
   if (quality < 90) {
-    if (histo_size > 256) combine_cost_factor /= 2.;
-    if (histo_size > 512) combine_cost_factor /= 2.;
-    if (histo_size > 1024) combine_cost_factor /= 2.;
-    if (quality <= 50) combine_cost_factor /= 2.;
+    if (histo_size > 256) combine_cost_factor /= 2.f;
+    if (histo_size > 512) combine_cost_factor /= 2.f;
+    if (histo_size > 1024) combine_cost_factor /= 2.f;
+    if (quality <= 50) combine_cost_factor /= 2.f;
   }
   return combine_cost_factor;
 }
@@ -1168,15 +1159,17 @@ static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) {
 }
 
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
-                             const VP8LBackwardRefs* const refs,
-                             int quality, int low_effort,
-                             int histo_bits, int cache_bits,
+                             const VP8LBackwardRefs* const refs, int quality,
+                             int low_effort, int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols) {
-  int ok = 0;
-  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
-  const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
+                             uint16_t* const histogram_symbols,
+                             const WebPPicture* const pic, int percent_range,
+                             int* const percent) {
+  const int histo_xsize =
+      histogram_bits ? VP8LSubSampleSize(xsize, histogram_bits) : 1;
+  const int histo_ysize =
+      histogram_bits ? VP8LSubSampleSize(ysize, histogram_bits) : 1;
   const int image_histo_raw_size = histo_xsize * histo_ysize;
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
@@ -1189,10 +1182,13 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
       WebPSafeMalloc(2 * image_histo_raw_size, sizeof(map_tmp));
   uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
   int num_used = image_histo_raw_size;
-  if (orig_histo == NULL || map_tmp == NULL) goto Error;
+  if (orig_histo == NULL || map_tmp == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    goto Error;
+  }
 
   // Construct the histograms from backward references.
-  HistogramBuild(xsize, histo_bits, refs, orig_histo);
+  HistogramBuild(xsize, histogram_bits, refs, orig_histo);
   // Copies the histograms and computes its bit_cost.
   // histogram_symbols is optimized
   HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used,
@@ -1203,16 +1199,15 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
   if (entropy_combine) {
     uint16_t* const bin_map = map_tmp;
-    const double combine_cost_factor =
+    const float combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
     const uint32_t num_clusters = num_used;
 
     HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols,
-                               cluster_mappings, tmp_histo, bin_map,
-                               entropy_combine_num_bins, combine_cost_factor,
-                               low_effort);
+    HistogramCombineEntropyBin(
+        image_histo, &num_used, histogram_symbols, cluster_mappings, tmp_histo,
+        bin_map, entropy_combine_num_bins, combine_cost_factor, low_effort);
     OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters,
                              map_tmp, histogram_symbols);
   }
@@ -1226,11 +1221,13 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
     int do_greedy;
     if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
                                     &do_greedy)) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
       goto Error;
     }
     if (do_greedy) {
       RemoveEmptyHistograms(image_histo);
       if (!HistogramCombineGreedy(image_histo, &num_used)) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
         goto Error;
       }
     }
@@ -1240,10 +1237,12 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   RemoveEmptyHistograms(image_histo);
   HistogramRemap(orig_histo, image_histo, histogram_symbols);
 
-  ok = 1;
+  if (!WebPReportProgress(pic, *percent + percent_range, percent)) {
+    goto Error;
+  }
 
  Error:
   VP8LFreeHistogramSet(orig_histo);
   WebPSafeFree(map_tmp);
-  return ok;
+  return (pic->error_code == VP8_ENC_OK);
 }
diff --git a/src/3rdparty/libwebp/src/enc/histogram_enc.h b/src/3rdparty/libwebp/src/enc/histogram_enc.h
index 54c2d21..4c0bb97 100644
--- a/src/3rdparty/libwebp/src/enc/histogram_enc.h
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.h
@@ -40,10 +40,10 @@ typedef struct {
   int palette_code_bits_;
   uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
                              // literal symbols are single valued.
-  double bit_cost_;          // cached value of bit cost.
-  double literal_cost_;      // Cached values of dominant entropy costs:
-  double red_cost_;          // literal, red & blue.
-  double blue_cost_;
+  float bit_cost_;           // cached value of bit cost.
+  float literal_cost_;       // Cached values of dominant entropy costs:
+  float red_cost_;           // literal, red & blue.
+  float blue_cost_;
   uint8_t is_used_[5];       // 5 for literal, red, blue, alpha, distance
 } VP8LHistogram;
 
@@ -64,8 +64,8 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
                          const VP8LBackwardRefs* const refs,
                          int palette_code_bits);
 
-// Return the size of the histogram for a given palette_code_bits.
-int VP8LGetHistogramSize(int palette_code_bits);
+// Return the size of the histogram for a given cache_bits.
+int VP8LGetHistogramSize(int cache_bits);
 
 // Set the palette_code_bits and reset the stats.
 // If init_arrays is true, the arrays are also filled with 0's.
@@ -105,21 +105,23 @@ static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
       ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
 }
 
-// Builds the histogram image.
+// Builds the histogram image. pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
-                             const VP8LBackwardRefs* const refs,
-                             int quality, int low_effort,
-                             int histogram_bits, int cache_bits,
-                             VP8LHistogramSet* const image_in,
+                             const VP8LBackwardRefs* const refs, int quality,
+                             int low_effort, int histogram_bits, int cache_bits,
+                             VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols);
+                             uint16_t* const histogram_symbols,
+                             const WebPPicture* const pic, int percent_range,
+                             int* const percent);
 
 // Returns the entropy for the symbols in the input array.
-double VP8LBitsEntropy(const uint32_t* const array, int n);
+float VP8LBitsEntropy(const uint32_t* const array, int n);
 
 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
-double VP8LHistogramEstimateBits(VP8LHistogram* const p);
+float VP8LHistogramEstimateBits(VP8LHistogram* const p);
 
 #ifdef __cplusplus
 }
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp_enc.c b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
index 02d9df7..78c8ca4 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,12 +15,19 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include "sharpyuv/sharpyuv.h"
+#include "sharpyuv/sharpyuv_csp.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/random_utils.h"
 #include "src/utils/utils.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/yuv.h"
+#include "src/dsp/cpu.h"
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>
+#endif
 
 // Uncomment to disable gamma-compression during RGB->U/V averaging
 #define USE_GAMMA_COMPRESSION
@@ -29,11 +36,15 @@
 #define USE_INVERSE_ALPHA_TABLE
 
 #ifdef WORDS_BIGENDIAN
-#define ALPHA_OFFSET 0   // uint32_t 0xff000000 is 0xff,00,00,00 in memory
+// uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#define CHANNEL_OFFSET(i) (i)
 #else
-#define ALPHA_OFFSET 3   // uint32_t 0xff000000 is 0x00,00,00,ff in memory
+// uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#define CHANNEL_OFFSET(i) (3-(i))
 #endif
 
+#define ALPHA_OFFSET CHANNEL_OFFSET(0)
+
 //------------------------------------------------------------------------------
 // Detection of non-trivial transparency
 
@@ -57,16 +68,16 @@ static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
 // Checking for the presence of non-opaque alpha.
 int WebPPictureHasTransparency(const WebPPicture* picture) {
   if (picture == NULL) return 0;
-  if (!picture->use_argb) {
-    return CheckNonOpaque(picture->a, picture->width, picture->height,
-                          1, picture->a_stride);
-  } else {
-    const int alpha_offset = ALPHA_OFFSET;
-    return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
-                          picture->width, picture->height,
-                          4, picture->argb_stride * sizeof(*picture->argb));
+  if (picture->use_argb) {
+    if (picture->argb != NULL) {
+      return CheckNonOpaque((const uint8_t*)picture->argb + ALPHA_OFFSET,
+                            picture->width, picture->height,
+                            4, picture->argb_stride * sizeof(*picture->argb));
+    }
+    return 0;
   }
-  return 0;
+  return CheckNonOpaque(picture->a, picture->width, picture->height,
+                        1, picture->a_stride);
 }
 
 //------------------------------------------------------------------------------
@@ -74,29 +85,30 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 
 #if defined(USE_GAMMA_COMPRESSION)
 
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80      // for now we use a different gamma value than kGammaF
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+// Gamma correction compensates loss of resolution during chroma subsampling.
+#define GAMMA_FIX 12      // fixed-point precision for linear values
+#define GAMMA_TAB_FIX 7   // fixed-point fractional bits precision
+#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX))
+static const double kGamma = 0.80;
+static const int kGammaScale = ((1 << GAMMA_FIX) - 1);
+static const int kGammaTabScale = (1 << GAMMA_TAB_FIX);
+static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1);
 
-static int kLinearToGammaTab[kGammaTabSize + 1];
+static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1];
 static uint16_t kGammaToLinearTab[256];
 static volatile int kGammaTablesOk = 0;
+static void InitGammaTables(void);
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {
+WEBP_DSP_INIT_FUNC(InitGammaTables) {
   if (!kGammaTablesOk) {
     int v;
-    const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+    const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale;
     const double norm = 1. / 255.;
     for (v = 0; v <= 255; ++v) {
       kGammaToLinearTab[v] =
           (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
     }
-    for (v = 0; v <= kGammaTabSize; ++v) {
+    for (v = 0; v <= GAMMA_TAB_SIZE; ++v) {
       kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
     }
     kGammaTablesOk = 1;
@@ -108,12 +120,12 @@ static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
 }
 
 static WEBP_INLINE int Interpolate(int v) {
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
+  const int tab_pos = v >> (GAMMA_TAB_FIX + 2);    // integer part
   const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
   const int v0 = kLinearToGammaTab[tab_pos];
   const int v1 = kLinearToGammaTab[tab_pos + 1];
   const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  assert(tab_pos + 1 < kGammaTabSize + 1);
+  assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1);
   return y;
 }
 
@@ -121,7 +133,7 @@ static WEBP_INLINE int Interpolate(int v) {
 // U/V value, suitable for RGBToU/V calls.
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   const int y = Interpolate(base_value << shift);   // final uplifted value
-  return (y + kGammaTabRounder) >> kGammaTabFix;    // descale
+  return (y + kGammaTabRounder) >> GAMMA_TAB_FIX;    // descale
 }
 
 #else
@@ -155,414 +167,26 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 //------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion
 
-static const int kNumIterations = 4;
 static const int kMinDimensionIterativeConversion = 4;
 
-// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
-// banding sometimes. Better use extra precision.
-#define SFIX 2                // fixed-point precision of RGB and Y/W
-typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
-typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
-
-#define SHALF (1 << SFIX >> 1)
-#define MAX_Y_T ((256 << SFIX) - 1)
-#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// We use tables of different size and precision for the Rec709 / BT2020
-// transfer function.
-#define kGammaF (1./0.45)
-static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
-#define GAMMA_TO_LINEAR_BITS 14
-static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
-static volatile int kGammaTablesSOk = 0;
-
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
-  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
-  if (!kGammaTablesSOk) {
-    int v;
-    const double norm = 1. / MAX_Y_T;
-    const double scale = 1. / kGammaTabSize;
-    const double a = 0.09929682680944;
-    const double thresh = 0.018053968510807;
-    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
-    for (v = 0; v <= MAX_Y_T; ++v) {
-      const double g = norm * v;
-      double value;
-      if (g <= thresh * 4.5) {
-        value = g / 4.5;
-      } else {
-        const double a_rec = 1. / (1. + a);
-        value = pow(a_rec * (g + a), kGammaF);
-      }
-      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double g = scale * v;
-      double value;
-      if (g <= thresh) {
-        value = 4.5 * g;
-      } else {
-        value = (1. + a) * pow(g, 1. / kGammaF) - a;
-      }
-      // we already incorporate the 1/2 rounding constant here
-      kLinearToGammaTabS[v] =
-          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
-    }
-    // to prevent small rounding errors to cause read-overflow:
-    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
-    kGammaTablesSOk = 1;
-  }
-}
-
-// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
-static WEBP_INLINE uint32_t GammaToLinearS(int v) {
-  return kGammaToLinearTabS[v];
-}
-
-static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
-  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
-  const uint32_t v = value * kGammaTabSize;
-  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
-  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
-  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
-  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
-  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
-  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
-  // Final interpolation. Note that rounding is already included.
-  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
-  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
-  return result;
-}
-
-#else
-
-static void InitGammaTablesS(void) {}
-static WEBP_INLINE uint32_t GammaToLinearS(int v) {
-  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
-}
-static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
-  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-static uint8_t clip_8b(fixed_t v) {
-  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
-}
-
-static fixed_y_t clip_y(int y) {
-  return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
-}
-
-//------------------------------------------------------------------------------
-
-static int RGBToGray(int r, int g, int b) {
-  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
-  return (luma >> YUV_FIX);
-}
-
-static uint32_t ScaleDown(int a, int b, int c, int d) {
-  const uint32_t A = GammaToLinearS(a);
-  const uint32_t B = GammaToLinearS(b);
-  const uint32_t C = GammaToLinearS(c);
-  const uint32_t D = GammaToLinearS(d);
-  return LinearToGammaS((A + B + C + D + 2) >> 2);
-}
-
-static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
-    const uint32_t R = GammaToLinearS(src[0 * w + i]);
-    const uint32_t G = GammaToLinearS(src[1 * w + i]);
-    const uint32_t B = GammaToLinearS(src[2 * w + i]);
-    const uint32_t Y = RGBToGray(R, G, B);
-    dst[i] = (fixed_y_t)LinearToGammaS(Y);
-  }
-}
-
-static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
-                         fixed_t* dst, int uv_w) {
-  int i;
-  for (i = 0; i < uv_w; ++i) {
-    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
-                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
-    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
-                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
-    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
-                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
-    const int W = RGBToGray(r, g, b);
-    dst[0 * uv_w] = (fixed_t)(r - W);
-    dst[1 * uv_w] = (fixed_t)(g - W);
-    dst[2 * uv_w] = (fixed_t)(b - W);
-    dst  += 1;
-    src1 += 2;
-    src2 += 2;
-  }
-}
-
-static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
-    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
-  }
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
-  const int v0 = (A * 3 + B + 2) >> 2;
-  return clip_y(v0 + W0);
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
-  return ((fixed_y_t)a << SFIX) | SHALF;
-}
-
-static void ImportOneRow(const uint8_t* const r_ptr,
-                         const uint8_t* const g_ptr,
-                         const uint8_t* const b_ptr,
-                         int step,
-                         int pic_width,
-                         fixed_y_t* const dst) {
-  int i;
-  const int w = (pic_width + 1) & ~1;
-  for (i = 0; i < pic_width; ++i) {
-    const int off = i * step;
-    dst[i + 0 * w] = UpLift(r_ptr[off]);
-    dst[i + 1 * w] = UpLift(g_ptr[off]);
-    dst[i + 2 * w] = UpLift(b_ptr[off]);
-  }
-  if (pic_width & 1) {  // replicate rightmost pixel
-    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
-    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
-    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
-  }
-}
-
-static void InterpolateTwoRows(const fixed_y_t* const best_y,
-                               const fixed_t* prev_uv,
-                               const fixed_t* cur_uv,
-                               const fixed_t* next_uv,
-                               int w,
-                               fixed_y_t* out1,
-                               fixed_y_t* out2) {
-  const int uv_w = w >> 1;
-  const int len = (w - 1) >> 1;   // length to filter
-  int k = 3;
-  while (k-- > 0) {   // process each R/G/B segments in turn
-    // special boundary case for i==0
-    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
-    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
-
-    WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
-    WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
-
-    // special boundary case for i == w - 1 when w is even
-    if (!(w & 1)) {
-      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
-                            best_y[w - 1 + 0]);
-      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
-                            best_y[w - 1 + w]);
-    }
-    out1 += w;
-    out2 += w;
-    prev_uv += uv_w;
-    cur_uv  += uv_w;
-    next_uv += uv_w;
-  }
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
-  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
-  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
-  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
-  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
-  const int v = +28800 * r - 24116 * g -  4684 * b + SROUNDER;
-  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
-}
-
-static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
-                            WebPPicture* const picture) {
-  int i, j;
-  uint8_t* dst_y = picture->y;
-  uint8_t* dst_u = picture->u;
-  uint8_t* dst_v = picture->v;
-  const fixed_t* const best_uv_base = best_uv;
-  const int w = (picture->width + 1) & ~1;
-  const int h = (picture->height + 1) & ~1;
-  const int uv_w = w >> 1;
-  const int uv_h = h >> 1;
-  for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
-    for (i = 0; i < picture->width; ++i) {
-      const int off = (i >> 1);
-      const int W = best_y[i];
-      const int r = best_uv[off + 0 * uv_w] + W;
-      const int g = best_uv[off + 1 * uv_w] + W;
-      const int b = best_uv[off + 2 * uv_w] + W;
-      dst_y[i] = ConvertRGBToY(r, g, b);
-    }
-    best_y += w;
-    best_uv += (j & 1) * 3 * uv_w;
-    dst_y += picture->y_stride;
-  }
-  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
-    for (i = 0; i < uv_w; ++i) {
-      const int off = i;
-      const int r = best_uv[off + 0 * uv_w];
-      const int g = best_uv[off + 1 * uv_w];
-      const int b = best_uv[off + 2 * uv_w];
-      dst_u[i] = ConvertRGBToU(r, g, b);
-      dst_v[i] = ConvertRGBToV(r, g, b);
-    }
-    best_uv += 3 * uv_w;
-    dst_u += picture->uv_stride;
-    dst_v += picture->uv_stride;
-  }
-  return 1;
-}
-
 //------------------------------------------------------------------------------
 // Main function
 
-#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
-
 static int PreprocessARGB(const uint8_t* r_ptr,
                           const uint8_t* g_ptr,
                           const uint8_t* b_ptr,
                           int step, int rgb_stride,
                           WebPPicture* const picture) {
-  // we expand the right/bottom border if needed
-  const int w = (picture->width + 1) & ~1;
-  const int h = (picture->height + 1) & ~1;
-  const int uv_w = w >> 1;
-  const int uv_h = h >> 1;
-  uint64_t prev_diff_y_sum = ~0;
-  int j, iter;
-
-  // TODO(skal): allocate one big memory chunk. But for now, it's easier
-  // for valgrind debugging to have several chunks.
-  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
-  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
-  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
-  fixed_y_t* best_y = best_y_base;
-  fixed_y_t* target_y = target_y_base;
-  fixed_t* best_uv = best_uv_base;
-  fixed_t* target_uv = target_uv_base;
-  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
-  int ok;
-
-  if (best_y_base == NULL || best_uv_base == NULL ||
-      target_y_base == NULL || target_uv_base == NULL ||
-      best_rgb_y == NULL || best_rgb_uv == NULL ||
-      tmp_buffer == NULL) {
-    ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
-    goto End;
-  }
-  assert(picture->width >= kMinDimensionIterativeConversion);
-  assert(picture->height >= kMinDimensionIterativeConversion);
-
-  WebPInitConvertARGBToYUV();
-
-  // Import RGB samples to W/RGB representation.
-  for (j = 0; j < picture->height; j += 2) {
-    const int is_last_row = (j == picture->height - 1);
-    fixed_y_t* const src1 = tmp_buffer + 0 * w;
-    fixed_y_t* const src2 = tmp_buffer + 3 * w;
-
-    // prepare two rows of input
-    ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
-    if (!is_last_row) {
-      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
-                   step, picture->width, src2);
-    } else {
-      memcpy(src2, src1, 3 * w * sizeof(*src2));
-    }
-    StoreGray(src1, best_y + 0, w);
-    StoreGray(src2, best_y + w, w);
-
-    UpdateW(src1, target_y, w);
-    UpdateW(src2, target_y + w, w);
-    UpdateChroma(src1, src2, target_uv, uv_w);
-    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
-    best_y += 2 * w;
-    best_uv += 3 * uv_w;
-    target_y += 2 * w;
-    target_uv += 3 * uv_w;
-    r_ptr += 2 * rgb_stride;
-    g_ptr += 2 * rgb_stride;
-    b_ptr += 2 * rgb_stride;
-  }
-
-  // Iterate and resolve clipping conflicts.
-  for (iter = 0; iter < kNumIterations; ++iter) {
-    const fixed_t* cur_uv = best_uv_base;
-    const fixed_t* prev_uv = best_uv_base;
-    uint64_t diff_y_sum = 0;
-
-    best_y = best_y_base;
-    best_uv = best_uv_base;
-    target_y = target_y_base;
-    target_uv = target_uv_base;
-    for (j = 0; j < h; j += 2) {
-      fixed_y_t* const src1 = tmp_buffer + 0 * w;
-      fixed_y_t* const src2 = tmp_buffer + 3 * w;
-      {
-        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
-        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
-        prev_uv = cur_uv;
-        cur_uv = next_uv;
-      }
-
-      UpdateW(src1, best_rgb_y + 0 * w, w);
-      UpdateW(src2, best_rgb_y + 1 * w, w);
-      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
-
-      // update two rows of Y and one row of RGB
-      diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
-      WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
-
-      best_y += 2 * w;
-      best_uv += 3 * uv_w;
-      target_y += 2 * w;
-      target_uv += 3 * uv_w;
-    }
-    // test exit condition
-    if (iter > 0) {
-      if (diff_y_sum < diff_y_threshold) break;
-      if (diff_y_sum > prev_diff_y_sum) break;
-    }
-    prev_diff_y_sum = diff_y_sum;
+  const int ok = SharpYuvConvert(
+      r_ptr, g_ptr, b_ptr, step, rgb_stride, /*rgb_bit_depth=*/8,
+      picture->y, picture->y_stride, picture->u, picture->uv_stride, picture->v,
+      picture->uv_stride, /*yuv_bit_depth=*/8, picture->width,
+      picture->height, SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp));
+  if (!ok) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  // final reconstruction
-  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
-
- End:
-  WebPSafeFree(best_y_base);
-  WebPSafeFree(best_uv_base);
-  WebPSafeFree(target_y_base);
-  WebPSafeFree(target_uv_base);
-  WebPSafeFree(best_rgb_y);
-  WebPSafeFree(best_rgb_uv);
-  WebPSafeFree(tmp_buffer);
   return ok;
 }
-#undef SAFE_ALLOC
 
 //------------------------------------------------------------------------------
 // "Fast" regular RGB->YUV
@@ -587,8 +211,8 @@ static const int kAlphaFix = 19;
 // and constant are adjusted very tightly to fit 32b arithmetic.
 // In particular, they use the fact that the operands for 'v / a' are actually
 // derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
-// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
-// overflow is: kGammaFix + kAlphaFix <= 31.
+// with ai in [0..255] and pi in [0..1<<GAMMA_FIX). The constraint to avoid
+// overflow is: GAMMA_FIX + kAlphaFix <= 31.
 static const uint32_t kInvAlpha[4 * 0xff + 1] = {
   0,  /* alpha = 0 */
   524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
@@ -814,11 +438,20 @@ static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
     dst[0] = SUM4(r_ptr + j, step);
     dst[1] = SUM4(g_ptr + j, step);
     dst[2] = SUM4(b_ptr + j, step);
+    // MemorySanitizer may raise false positives with data that passes through
+    // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles.
+    // See https://crbug.com/webp/573.
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
   }
   if (width & 1) {
     dst[0] = SUM2(r_ptr + j);
     dst[1] = SUM2(g_ptr + j);
     dst[2] = SUM2(b_ptr + j);
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
   }
 }
 
@@ -835,6 +468,8 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
   }
 }
 
+extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
+
 static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
                               const uint8_t* g_ptr,
                               const uint8_t* b_ptr,
@@ -859,18 +494,18 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
     use_iterative_conversion = 0;
   }
 
-  if (!WebPPictureAllocYUVA(picture, width, height)) {
+  if (!WebPPictureAllocYUVA(picture)) {
     return 0;
   }
   if (has_alpha) {
     assert(step == 4);
 #if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
-    assert(kAlphaFix + kGammaFix <= 31);
+    assert(kAlphaFix + GAMMA_FIX <= 31);
 #endif
   }
 
   if (use_iterative_conversion) {
-    InitGammaTablesS();
+    SharpYuvInit(VP8GetCPUInfo);
     if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
       return 0;
     }
@@ -997,10 +632,10 @@ static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   } else {
     const uint8_t* const argb = (const uint8_t*)picture->argb;
-    const uint8_t* const a = argb + (0 ^ ALPHA_OFFSET);
-    const uint8_t* const r = argb + (1 ^ ALPHA_OFFSET);
-    const uint8_t* const g = argb + (2 ^ ALPHA_OFFSET);
-    const uint8_t* const b = argb + (3 ^ ALPHA_OFFSET);
+    const uint8_t* const a = argb + CHANNEL_OFFSET(0);
+    const uint8_t* const r = argb + CHANNEL_OFFSET(1);
+    const uint8_t* const g = argb + CHANNEL_OFFSET(2);
+    const uint8_t* const b = argb + CHANNEL_OFFSET(3);
 
     picture->colorspace = WEBP_YUV420;
     return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
@@ -1040,7 +675,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
   // Allocate a new argb buffer (discarding the previous one).
-  if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+  if (!WebPPictureAllocARGB(picture)) return 0;
   picture->use_argb = 1;
 
   // Convert
@@ -1050,7 +685,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     const int height = picture->height;
     const int argb_stride = 4 * picture->argb_stride;
     uint8_t* dst = (uint8_t*)picture->argb;
-    const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+    const uint8_t* cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
     WebPUpsampleLinePairFunc upsample =
         WebPGetLinePairConverter(ALPHA_OFFSET > 0);
 
@@ -1102,6 +737,8 @@ static int Import(WebPPicture* const picture,
   const int width = picture->width;
   const int height = picture->height;
 
+  if (abs(rgb_stride) < (import_alpha ? 4 : 3) * width) return 0;
+
   if (!picture->use_argb) {
     const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
@@ -1159,24 +796,24 @@ static int Import(WebPPicture* const picture,
 #if !defined(WEBP_REDUCE_CSP)
 
 int WebPPictureImportBGR(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return (picture != NULL && rgb != NULL)
-             ? Import(picture, rgb, rgb_stride, 3, 1, 0)
+                         const uint8_t* bgr, int bgr_stride) {
+  return (picture != NULL && bgr != NULL)
+             ? Import(picture, bgr, bgr_stride, 3, 1, 0)
              : 0;
 }
 
 int WebPPictureImportBGRA(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+                          const uint8_t* bgra, int bgra_stride) {
+  return (picture != NULL && bgra != NULL)
+             ? Import(picture, bgra, bgra_stride, 4, 1, 1)
              : 0;
 }
 
 
 int WebPPictureImportBGRX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+                          const uint8_t* bgrx, int bgrx_stride) {
+  return (picture != NULL && bgrx != NULL)
+             ? Import(picture, bgrx, bgrx_stride, 4, 1, 0)
              : 0;
 }
 
@@ -1197,9 +834,9 @@ int WebPPictureImportRGBA(WebPPicture* picture,
 }
 
 int WebPPictureImportRGBX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+                          const uint8_t* rgbx, int rgbx_stride) {
+  return (picture != NULL && rgbx != NULL)
+             ? Import(picture, rgbx, rgbx_stride, 4, 0, 0)
              : 0;
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/picture_enc.c b/src/3rdparty/libwebp/src/enc/picture_enc.c
index c691622..3af6383 100644
--- a/src/3rdparty/libwebp/src/enc/picture_enc.c
+++ b/src/3rdparty/libwebp/src/enc/picture_enc.c
@@ -45,6 +45,22 @@ int WebPPictureInitInternal(WebPPicture* picture, int version) {
 
 //------------------------------------------------------------------------------
 
+int WebPValidatePicture(const WebPPicture* const picture) {
+  if (picture == NULL) return 0;
+  if (picture->width <= 0 || picture->height <= 0) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (picture->width <= 0 || picture->width / 4 > INT_MAX / 4 ||
+      picture->height <= 0 || picture->height / 4 > INT_MAX / 4) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (picture->colorspace != WEBP_YUV420 &&
+      picture->colorspace != WEBP_YUV420A) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  return 1;
+}
+
 static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
   picture->memory_argb_ = NULL;
   picture->argb = NULL;
@@ -63,18 +79,17 @@ void WebPPictureResetBuffers(WebPPicture* const picture) {
   WebPPictureResetBufferYUVA(picture);
 }
 
-int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+int WebPPictureAllocARGB(WebPPicture* const picture) {
   void* memory;
+  const int width = picture->width;
+  const int height = picture->height;
   const uint64_t argb_size = (uint64_t)width * height;
 
-  assert(picture != NULL);
+  if (!WebPValidatePicture(picture)) return 0;
 
   WebPSafeFree(picture->memory_argb_);
   WebPPictureResetBufferARGB(picture);
 
-  if (width <= 0 || height <= 0) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
-  }
   // allocate a new buffer.
   memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
   if (memory == NULL) {
@@ -86,10 +101,10 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
   return 1;
 }
 
-int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
-  const WebPEncCSP uv_csp =
-      (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+int WebPPictureAllocYUVA(WebPPicture* const picture) {
   const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const int width = picture->width;
+  const int height = picture->height;
   const int y_stride = width;
   const int uv_width = (int)(((int64_t)width + 1) >> 1);
   const int uv_height = (int)(((int64_t)height + 1) >> 1);
@@ -98,15 +113,11 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
   uint64_t y_size, uv_size, a_size, total_size;
   uint8_t* mem;
 
-  assert(picture != NULL);
+  if (!WebPValidatePicture(picture)) return 0;
 
   WebPSafeFree(picture->memory_);
   WebPPictureResetBufferYUVA(picture);
 
-  if (uv_csp != WEBP_YUV420) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
-  }
-
   // alpha
   a_width = has_alpha ? width : 0;
   a_stride = a_width;
@@ -152,15 +163,12 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
 
 int WebPPictureAlloc(WebPPicture* picture) {
   if (picture != NULL) {
-    const int width = picture->width;
-    const int height = picture->height;
-
     WebPPictureFree(picture);   // erase previous buffer
 
     if (!picture->use_argb) {
-      return WebPPictureAllocYUVA(picture, width, height);
+      return WebPPictureAllocYUVA(picture);
     } else {
-      return WebPPictureAllocARGB(picture, width, height);
+      return WebPPictureAllocARGB(picture);
     }
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
index 58a6ae7..839f91c 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -13,14 +13,15 @@
 
 #include "src/webp/encode.h"
 
-#if !defined(WEBP_REDUCE_SIZE)
-
 #include <assert.h>
 #include <stdlib.h>
 
 #include "src/enc/vp8i_enc.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
 #include "src/utils/rescaler_utils.h"
 #include "src/utils/utils.h"
+#endif  // !defined(WEBP_REDUCE_SIZE)
 
 #define HALVE(x) (((x) + 1) >> 1)
 
@@ -56,6 +57,7 @@ static int AdjustAndCheckRectangle(const WebPPicture* const pic,
   return 1;
 }
 
+#if !defined(WEBP_REDUCE_SIZE)
 int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   if (src == NULL || dst == NULL) return 0;
   if (src == dst) return 1;
@@ -81,6 +83,7 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   }
   return 1;
 }
+#endif  // !defined(WEBP_REDUCE_SIZE)
 
 int WebPPictureIsView(const WebPPicture* picture) {
   if (picture == NULL) return 0;
@@ -120,6 +123,7 @@ int WebPPictureView(const WebPPicture* src,
   return 1;
 }
 
+#if !defined(WEBP_REDUCE_SIZE)
 //------------------------------------------------------------------------------
 // Picture cropping
 
@@ -164,22 +168,25 @@ int WebPPictureCrop(WebPPicture* pic,
 //------------------------------------------------------------------------------
 // Simple picture rescaler
 
-static void RescalePlane(const uint8_t* src,
-                         int src_width, int src_height, int src_stride,
-                         uint8_t* dst,
-                         int dst_width, int dst_height, int dst_stride,
-                         rescaler_t* const work,
-                         int num_channels) {
+static int RescalePlane(const uint8_t* src,
+                        int src_width, int src_height, int src_stride,
+                        uint8_t* dst,
+                        int dst_width, int dst_height, int dst_stride,
+                        rescaler_t* const work,
+                        int num_channels) {
   WebPRescaler rescaler;
   int y = 0;
-  WebPRescalerInit(&rescaler, src_width, src_height,
-                   dst, dst_width, dst_height, dst_stride,
-                   num_channels, work);
+  if (!WebPRescalerInit(&rescaler, src_width, src_height,
+                        dst, dst_width, dst_height, dst_stride,
+                        num_channels, work)) {
+    return 0;
+  }
   while (y < src_height) {
     y += WebPRescalerImport(&rescaler, src_height - y,
                             src + y * src_stride, src_stride);
     WebPRescalerExport(&rescaler);
   }
+  return 1;
 }
 
 static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
@@ -195,52 +202,53 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
   }
 }
 
-int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+int WebPPictureRescale(WebPPicture* picture, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
   rescaler_t* work;
 
-  if (pic == NULL) return 0;
-  prev_width = pic->width;
-  prev_height = pic->height;
+  if (picture == NULL) return 0;
+  prev_width = picture->width;
+  prev_height = picture->height;
   if (!WebPRescalerGetScaledDimensions(
           prev_width, prev_height, &width, &height)) {
     return 0;
   }
 
-  PictureGrabSpecs(pic, &tmp);
+  PictureGrabSpecs(picture, &tmp);
   tmp.width = width;
   tmp.height = height;
   if (!WebPPictureAlloc(&tmp)) return 0;
 
-  if (!pic->use_argb) {
+  if (!picture->use_argb) {
     work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
       return 0;
     }
     // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
+    if (picture->a != NULL) {
       WebPInitAlphaProcessing();
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
+      if (!RescalePlane(picture->a, prev_width, prev_height, picture->a_stride,
+                        tmp.a, width, height, tmp.a_stride, work, 1)) {
+        return 0;
+      }
     }
 
     // We take transparency into account on the luma plane only. That's not
     // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
-    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
-                 tmp.y, width, height, tmp.y_stride, work, 1);
+    AlphaMultiplyY(picture, 0);
+    if (!RescalePlane(picture->y, prev_width, prev_height, picture->y_stride,
+                      tmp.y, width, height, tmp.y_stride, work, 1) ||
+        !RescalePlane(picture->u, HALVE(prev_width), HALVE(prev_height),
+                      picture->uv_stride, tmp.u, HALVE(width), HALVE(height),
+                      tmp.uv_stride, work, 1) ||
+        !RescalePlane(picture->v, HALVE(prev_width), HALVE(prev_height),
+                      picture->uv_stride, tmp.v, HALVE(width), HALVE(height),
+                      tmp.uv_stride, work, 1)) {
+      return 0;
+    }
     AlphaMultiplyY(&tmp, 1);
-
-    RescalePlane(pic->u,
-                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                 tmp.u,
-                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
-    RescalePlane(pic->v,
-                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                 tmp.v,
-                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
   } else {
     work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
     if (work == NULL) {
@@ -251,17 +259,17 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
     // weighting first (black-matting), scale the RGB values, and remove
     // the premultiplication afterward (while preserving the alpha channel).
     WebPInitAlphaProcessing();
-    AlphaMultiplyARGB(pic, 0);
-    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
-                 pic->argb_stride * 4,
-                 (uint8_t*)tmp.argb, width, height,
-                 tmp.argb_stride * 4,
-                 work, 4);
+    AlphaMultiplyARGB(picture, 0);
+    if (!RescalePlane((const uint8_t*)picture->argb, prev_width, prev_height,
+                      picture->argb_stride * 4, (uint8_t*)tmp.argb, width,
+                      height, tmp.argb_stride * 4, work, 4)) {
+      return 0;
+    }
     AlphaMultiplyARGB(&tmp, 1);
   }
-  WebPPictureFree(pic);
+  WebPPictureFree(picture);
   WebPSafeFree(work);
-  *pic = tmp;
+  *picture = tmp;
   return 1;
 }
 
@@ -273,23 +281,6 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   return 0;
 }
 
-int WebPPictureIsView(const WebPPicture* picture) {
-  (void)picture;
-  return 0;
-}
-
-int WebPPictureView(const WebPPicture* src,
-                    int left, int top, int width, int height,
-                    WebPPicture* dst) {
-  (void)src;
-  (void)left;
-  (void)top;
-  (void)width;
-  (void)height;
-  (void)dst;
-  return 0;
-}
-
 int WebPPictureCrop(WebPPicture* pic,
                     int left, int top, int width, int height) {
   (void)pic;
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools_enc.c b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
index d0e8a49..147cc18 100644
--- a/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
+++ b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
@@ -83,6 +83,19 @@ static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
   return (count == 0);
 }
 
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) {
+  if (pic != NULL && pic->use_argb) {
+    int y = pic->height;
+    uint32_t* argb = pic->argb;
+    color &= 0xffffffu;   // force alpha=0
+    WebPInitAlphaProcessing();
+    while (y-- > 0) {
+      WebPAlphaReplace(argb, pic->width, color);
+      argb += pic->argb_stride;
+    }
+  }
+}
+
 void WebPCleanupTransparentArea(WebPPicture* pic) {
   int x, y, w, h;
   if (pic == NULL) return;
@@ -165,24 +178,6 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2
 
-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) {
-  int x, y, w, h;
-  uint32_t* argb;
-  assert(pic != NULL && pic->use_argb);
-  w = pic->width;
-  h = pic->height;
-  argb = pic->argb;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      if ((argb[x] & 0xff000000) == 0) {
-        argb[x] = 0x00000000;
-      }
-    }
-    argb += pic->argb_stride;
-  }
-}
-
 //------------------------------------------------------------------------------
 // Blend color and remove transparency info
 
@@ -195,27 +190,28 @@ static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
   return (0xff000000u | (r << 16) | (g << 8) | b);
 }
 
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+void WebPBlendAlpha(WebPPicture* picture, uint32_t background_rgb) {
   const int red = (background_rgb >> 16) & 0xff;
   const int green = (background_rgb >> 8) & 0xff;
   const int blue = (background_rgb >> 0) & 0xff;
   int x, y;
-  if (pic == NULL) return;
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+  if (picture == NULL) return;
+  if (!picture->use_argb) {
+    // omit last pixel during u/v loop
+    const int uv_width = (picture->width >> 1);
     const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
     // VP8RGBToU/V expects the u/v values summed over four pixels
     const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
     const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    uint8_t* y_ptr = pic->y;
-    uint8_t* u_ptr = pic->u;
-    uint8_t* v_ptr = pic->v;
-    uint8_t* a_ptr = pic->a;
+    const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+    uint8_t* y_ptr = picture->y;
+    uint8_t* u_ptr = picture->u;
+    uint8_t* v_ptr = picture->v;
+    uint8_t* a_ptr = picture->a;
     if (!has_alpha || a_ptr == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
+    for (y = 0; y < picture->height; ++y) {
       // Luma blending
-      for (x = 0; x < pic->width; ++x) {
+      for (x = 0; x < picture->width; ++x) {
         const uint8_t alpha = a_ptr[x];
         if (alpha < 0xff) {
           y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
@@ -224,7 +220,7 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
       // Chroma blending every even line
       if ((y & 1) == 0) {
         uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+            (y + 1 == picture->height) ? a_ptr : a_ptr + picture->a_stride;
         for (x = 0; x < uv_width; ++x) {
           // Average four alpha values into a single blending weight.
           // TODO(skal): might lead to visible contouring. Can we do better?
@@ -234,24 +230,24 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
           u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
           v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
         }
-        if (pic->width & 1) {   // rightmost pixel
+        if (picture->width & 1) {  // rightmost pixel
           const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
           u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
           v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
         }
       } else {
-        u_ptr += pic->uv_stride;
-        v_ptr += pic->uv_stride;
+        u_ptr += picture->uv_stride;
+        v_ptr += picture->uv_stride;
       }
-      memset(a_ptr, 0xff, pic->width);  // reset alpha value to opaque
-      a_ptr += pic->a_stride;
-      y_ptr += pic->y_stride;
+      memset(a_ptr, 0xff, picture->width);  // reset alpha value to opaque
+      a_ptr += picture->a_stride;
+      y_ptr += picture->y_stride;
     }
   } else {
-    uint32_t* argb = pic->argb;
+    uint32_t* argb = picture->argb;
     const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
+    for (y = 0; y < picture->height; ++y) {
+      for (x = 0; x < picture->width; ++x) {
         const int alpha = (argb[x] >> 24) & 0xff;
         if (alpha != 0xff) {
           if (alpha > 0) {
@@ -267,7 +263,7 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
           }
         }
       }
-      argb += pic->argb_stride;
+      argb += picture->argb_stride;
     }
   }
 }
diff --git a/src/3rdparty/libwebp/src/enc/predictor_enc.c b/src/3rdparty/libwebp/src/enc/predictor_enc.c
index 2e6762e..b3d44b5 100644
--- a/src/3rdparty/libwebp/src/enc/predictor_enc.c
+++ b/src/3rdparty/libwebp/src/enc/predictor_enc.c
@@ -16,6 +16,7 @@
 
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8li_enc.h"
 
 #define MAX_DIFF_COST (1e30f)
@@ -31,10 +32,10 @@ static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
 // Methods to calculate Entropy (Shannon).
 
 static float PredictionCostSpatial(const int counts[256], int weight_0,
-                                   double exp_val) {
+                                   float exp_val) {
   const int significant_symbols = 256 >> 4;
-  const double exp_decay_factor = 0.6;
-  double bits = weight_0 * counts[0];
+  const float exp_decay_factor = 0.6f;
+  float bits = (float)weight_0 * counts[0];
   int i;
   for (i = 1; i < significant_symbols; ++i) {
     bits += exp_val * (counts[i] + counts[256 - i]);
@@ -46,9 +47,9 @@ static float PredictionCostSpatial(const int counts[256], int weight_0,
 static float PredictionCostSpatialHistogram(const int accumulated[4][256],
                                             const int tile[4][256]) {
   int i;
-  double retval = 0;
+  float retval = 0.f;
   for (i = 0; i < 4; ++i) {
-    const double kExpValue = 0.94;
+    const float kExpValue = 0.94f;
     retval += PredictionCostSpatial(tile[i], 1, kExpValue);
     retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
   }
@@ -249,7 +250,7 @@ static WEBP_INLINE void GetResidual(
       } else if (x == 0) {
         predict = upper_row[x];  // Top.
       } else {
-        predict = pred_func(current_row[x - 1], upper_row + x);
+        predict = pred_func(&current_row[x - 1], upper_row + x);
       }
 #if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
@@ -472,12 +473,15 @@ static void CopyImageWithPrediction(int width, int height,
 // with respect to predictions. If near_lossless_quality < 100, applies
 // near lossless processing, shaving off more bits of residuals for lower
 // qualities.
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
-                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image, int near_lossless_quality,
-                       int exact, int used_subtract_green) {
+int VP8LResidualImage(int width, int height, int bits, int low_effort,
+                      uint32_t* const argb, uint32_t* const argb_scratch,
+                      uint32_t* const image, int near_lossless_quality,
+                      int exact, int used_subtract_green,
+                      const WebPPicture* const pic, int percent_range,
+                      int* const percent) {
   const int tiles_per_row = VP8LSubSampleSize(width, bits);
   const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  int percent_start = *percent;
   int tile_y;
   int histo[4][256];
   const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
@@ -491,17 +495,24 @@ void VP8LResidualImage(int width, int height, int bits, int low_effort,
     for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
       int tile_x;
       for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
-            bits, histo, argb_scratch, argb, max_quantization, exact,
-            used_subtract_green, image);
+        const int pred = GetBestPredictorForTile(
+            width, height, tile_x, tile_y, bits, histo, argb_scratch, argb,
+            max_quantization, exact, used_subtract_green, image);
         image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
       }
+
+      if (!WebPReportProgress(
+              pic, percent_start + percent_range * tile_y / tiles_per_col,
+              percent)) {
+        return 0;
+      }
     }
   }
 
   CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
                           low_effort, max_quantization, exact,
                           used_subtract_green);
+  return WebPReportProgress(pic, percent_start + percent_range, percent);
 }
 
 //------------------------------------------------------------------------------
@@ -532,7 +543,7 @@ static float PredictionCostCrossColor(const int accumulated[256],
                                       const int counts[256]) {
   // Favor low entropy, locally and globally.
   // Favor small absolute values for PredictionCostSpatial
-  static const double kExpValue = 2.4;
+  static const float kExpValue = 2.4f;
   return VP8LCombinedShannonEntropy(counts, accumulated) +
          PredictionCostSpatial(counts, 3, kExpValue);
 }
@@ -714,11 +725,14 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
   }
 }
 
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image) {
+int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                            uint32_t* const argb, uint32_t* image,
+                            const WebPPicture* const pic, int percent_range,
+                            int* const percent) {
   const int max_tile_size = 1 << bits;
   const int tile_xsize = VP8LSubSampleSize(width, bits);
   const int tile_ysize = VP8LSubSampleSize(height, bits);
+  int percent_start = *percent;
   int accumulated_red_histo[256] = { 0 };
   int accumulated_blue_histo[256] = { 0 };
   int tile_x, tile_y;
@@ -768,5 +782,11 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
         }
       }
     }
+    if (!WebPReportProgress(
+            pic, percent_start + percent_range * tile_y / tile_ysize,
+            percent)) {
+      return 0;
+    }
   }
+  return 1;
 }
diff --git a/src/3rdparty/libwebp/src/enc/quant_enc.c b/src/3rdparty/libwebp/src/enc/quant_enc.c
index 01eb565..6d8202d 100644
--- a/src/3rdparty/libwebp/src/enc/quant_enc.c
+++ b/src/3rdparty/libwebp/src/enc/quant_enc.c
@@ -533,7 +533,8 @@ static void InitScore(VP8ModeScore* const rd) {
   rd->score = MAX_COST;
 }
 
-static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
+                      const VP8ModeScore* WEBP_RESTRICT const src) {
   dst->D  = src->D;
   dst->SD = src->SD;
   dst->R  = src->R;
@@ -542,7 +543,8 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->score = src->score;
 }
 
-static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
+                     const VP8ModeScore* WEBP_RESTRICT const src) {
   dst->D  += src->D;
   dst->SD += src->SD;
   dst->R  += src->R;
@@ -585,15 +587,18 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
   return rate * lambda + RD_DISTO_MULT * distortion;
 }
 
-static int TrellisQuantizeBlock(const VP8Encoder* const enc,
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
+static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                 int16_t in[16], int16_t out[16],
                                 int ctx0, int coeff_type,
-                                const VP8Matrix* const mtx,
+                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                 int lambda) {
   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
   CostArrayPtr const costs =
       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
-  const int first = (coeff_type == 0) ? 1 : 0;
+  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
   Node nodes[16][NUM_NODES];
   ScoreState score_states[2][NUM_NODES];
   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
@@ -657,16 +662,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
-      int level = level0 + m;
+      const int level = level0 + m;
       const int ctx = (level > 2) ? 2 : level;
       const int band = VP8EncBands[n + 1];
       score_t base_score;
-      score_t best_cur_score = MAX_COST;
-      int best_prev = 0;   // default, in case
+      score_t best_cur_score;
+      int best_prev;
+      score_t cost, score;
 
-      ss_cur[m].score = MAX_COST;
       ss_cur[m].costs = costs[n + 1][ctx];
       if (level < 0 || level > thresh_level) {
+        ss_cur[m].score = MAX_COST;
         // Node is dead.
         continue;
       }
@@ -682,18 +688,24 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       }
 
       // Inspect all possible non-dead predecessors. Retain only the best one.
-      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
+      // The base_score is added to all scores so it is only added for the final
+      // value after the loop.
+      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+      best_cur_score =
+          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+      best_prev = -MIN_DELTA;
+      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
         // eliminated since their score can't be better than the current best.
-        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
+        cost = VP8LevelCost(ss_prev[p].costs, level);
         // Examine node assuming it's a non-terminal one.
-        const score_t score =
-            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
         if (score < best_cur_score) {
           best_cur_score = score;
           best_prev = p;
         }
       }
+      best_cur_score += base_score;
       // Store best finding in current node.
       cur->sign = sign;
       cur->level = level;
@@ -701,11 +713,11 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       ss_cur[m].score = best_cur_score;
 
       // Now, record best terminal node (and thus best entry in the graph).
-      if (level != 0) {
+      if (level != 0 && best_cur_score < best_score) {
         const score_t last_pos_cost =
             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
-        const score_t score = best_cur_score + last_pos_score;
+        score = best_cur_score + last_pos_score;
         if (score < best_score) {
           best_score = score;
           best_path[0] = n;                     // best eob position
@@ -717,10 +729,16 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
   }
 
   // Fresh start
-  memset(in + first, 0, (16 - first) * sizeof(*in));
-  memset(out + first, 0, (16 - first) * sizeof(*out));
+  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
+  if (coeff_type == TYPE_I16_AC) {
+    memset(in + 1, 0, 15 * sizeof(*in));
+    memset(out + 1, 0, 15 * sizeof(*out));
+  } else {
+    memset(in, 0, 16 * sizeof(*in));
+    memset(out, 0, 16 * sizeof(*out));
+  }
   if (best_path[0] == -1) {
-    return 0;   // skip!
+    return 0;  // skip!
   }
 
   {
@@ -751,9 +769,9 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
 // all at once. Output is the reconstructed block in *yuv_out, and the
 // quantized levels in *levels.
 
-static int ReconstructIntra16(VP8EncIterator* const it,
-                              VP8ModeScore* const rd,
-                              uint8_t* const yuv_out,
+static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+                              VP8ModeScore* WEBP_RESTRICT const rd,
+                              uint8_t* WEBP_RESTRICT const yuv_out,
                               int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
@@ -775,9 +793,9 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     for (y = 0, n = 0; y < 4; ++y) {
       for (x = 0; x < 4; ++x, ++n) {
         const int ctx = it->top_nz_[x] + it->left_nz_[y];
-        const int non_zero =
-            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
-                                 &dqm->y1_, dqm->lambda_trellis_i16_);
+        const int non_zero = TrellisQuantizeBlock(
+            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+            dqm->lambda_trellis_i16_);
         it->top_nz_[x] = it->left_nz_[y] = non_zero;
         rd->y_ac_levels[n][0] = 0;
         nz |= non_zero << n;
@@ -803,10 +821,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
   return nz;
 }
 
-static int ReconstructIntra4(VP8EncIterator* const it,
+static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                              int16_t levels[16],
-                             const uint8_t* const src,
-                             uint8_t* const yuv_out,
+                             const uint8_t* WEBP_RESTRICT const src,
+                             uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
@@ -818,7 +836,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   if (DO_TRELLIS_I4 && it->do_trellis_) {
     const int x = it->i4_ & 3, y = it->i4_ >> 2;
     const int ctx = it->top_nz_[x] + it->left_nz_[y];
-    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
+    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
@@ -839,7 +857,8 @@ static int ReconstructIntra4(VP8EncIterator* const it,
 
 // Quantize as usual, but also compute and return the quantization error.
 // Error is already divided by DSHIFT.
-static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
+                          const VP8Matrix* WEBP_RESTRICT const mtx) {
   int V = *v;
   const int sign = (V < 0);
   if (sign) V = -V;
@@ -853,9 +872,10 @@ static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
   return (sign ? -V : V) >> DSCALE;
 }
 
-static void CorrectDCValues(const VP8EncIterator* const it,
-                            const VP8Matrix* const mtx,
-                            int16_t tmp[][16], VP8ModeScore* const rd) {
+static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
+                            const VP8Matrix* WEBP_RESTRICT const mtx,
+                            int16_t tmp[][16],
+                            VP8ModeScore* WEBP_RESTRICT const rd) {
   //         | top[0] | top[1]
   // --------+--------+---------
   // left[0] | tmp[0]   tmp[1]  <->   err0 err1
@@ -886,8 +906,8 @@ static void CorrectDCValues(const VP8EncIterator* const it,
   }
 }
 
-static void StoreDiffusionErrors(VP8EncIterator* const it,
-                                 const VP8ModeScore* const rd) {
+static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
+                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
   int ch;
   for (ch = 0; ch <= 1; ++ch) {
     int8_t* const top = it->top_derr_[it->x_][ch];
@@ -906,8 +926,9 @@ static void StoreDiffusionErrors(VP8EncIterator* const it,
 
 //------------------------------------------------------------------------------
 
-static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
-                         uint8_t* const yuv_out, int mode) {
+static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
+                         VP8ModeScore* WEBP_RESTRICT const rd,
+                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
@@ -927,9 +948,9 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       for (y = 0; y < 2; ++y) {
         for (x = 0; x < 2; ++x, ++n) {
           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-          const int non_zero =
-              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
-                                   &dqm->uv_, dqm->lambda_trellis_uv_);
+          const int non_zero = TrellisQuantizeBlock(
+              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+              dqm->lambda_trellis_uv_);
           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
           nz |= non_zero << n;
         }
@@ -978,7 +999,8 @@ static void SwapOut(VP8EncIterator* const it) {
   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }
 
-static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
+static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+                            VP8ModeScore* WEBP_RESTRICT rd) {
   const int kNumBlocks = 16;
   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
@@ -1038,7 +1060,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
 //------------------------------------------------------------------------------
 
 // return the cost array corresponding to the surrounding prediction modes.
-static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
+static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                      const uint8_t modes[16]) {
   const int preds_w = it->enc_->preds_w_;
   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
@@ -1047,7 +1069,8 @@ static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
   return VP8FixedCostsI4[top][left];
 }
 
-static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                          VP8ModeScore* WEBP_RESTRICT const rd) {
   const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i4_;
@@ -1143,7 +1166,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
 //------------------------------------------------------------------------------
 
-static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
+                       VP8ModeScore* WEBP_RESTRICT const rd) {
   const int kNumBlocks = 8;
   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
@@ -1195,7 +1219,8 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 // Final reconstruction and quantization.
 
-static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
+                           VP8ModeScore* WEBP_RESTRICT const rd) {
   const VP8Encoder* const enc = it->enc_;
   const int is_i16 = (it->mb_->type_ == 1);
   int nz = 0;
@@ -1220,9 +1245,9 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
 }
 
 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void RefineUsingDistortion(VP8EncIterator* const it,
+static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                   int try_both_modes, int refine_uv_mode,
-                                  VP8ModeScore* const rd) {
+                                  VP8ModeScore* WEBP_RESTRICT const rd) {
   score_t best_score = MAX_COST;
   int nz = 0;
   int mode;
@@ -1336,7 +1361,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // Entry point
 
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
+                VP8ModeScore* WEBP_RESTRICT const rd,
                 VP8RDLevel rd_opt) {
   int is_skipped;
   const int method = it->enc_->method_;
diff --git a/src/3rdparty/libwebp/src/enc/syntax_enc.c b/src/3rdparty/libwebp/src/enc/syntax_enc.c
index a9e5a6c..e18cf65 100644
--- a/src/3rdparty/libwebp/src/enc/syntax_enc.c
+++ b/src/3rdparty/libwebp/src/enc/syntax_enc.c
@@ -349,7 +349,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
                                        (enc->alpha_data_size_ & 1);
     riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
   }
-  // Sanity check.
+  // RIFF size should fit in 32-bits.
   if (riff_size > 0xfffffffeU) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
   }
diff --git a/src/3rdparty/libwebp/src/enc/vp8i_enc.h b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
index 24e1944..c9927c4 100644
--- a/src/3rdparty/libwebp/src/enc/vp8i_enc.h
+++ b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -31,8 +31,8 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 0
-#define ENC_REV_VERSION 3
+#define ENC_MIN_VERSION 3
+#define ENC_REV_VERSION 0
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -249,7 +249,7 @@ typedef struct {
   int           percent0_;         // saved initial progress percent
 
   DError        left_derr_;        // left error diffusion (u/v)
-  DError       *top_derr_;         // top diffusion error - NULL if disabled
+  DError*       top_derr_;         // top diffusion error - NULL if disabled
 
   uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
   uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
@@ -286,8 +286,7 @@ int VP8IteratorNext(VP8EncIterator* const it);
 // save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
 void VP8IteratorSaveBoundary(VP8EncIterator* const it);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
-int VP8IteratorProgress(const VP8EncIterator* const it,
-                        int final_delta_percent);
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta);
 // Intra4x4 iterations
 void VP8IteratorStartI4(VP8EncIterator* const it);
 // returns true if not done.
@@ -471,7 +470,8 @@ int VP8EncAnalyze(VP8Encoder* const enc);
 // Sets up segment's quantization values, base_quant_ and filter strengths.
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
+                VP8ModeScore* WEBP_RESTRICT const rd,
                 VP8RDLevel rd_opt);
 
   // in alpha.c
@@ -491,23 +491,28 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta);
 
   // misc utils for picture_*.c:
 
+// Returns true if 'picture' is non-NULL and dimensions/colorspace are within
+// their valid ranges. If returning false, the 'error_code' in 'picture' is
+// updated.
+int WebPValidatePicture(const WebPPicture* const picture);
+
 // Remove reference to the ARGB/YUVA buffer (doesn't free anything).
 void WebPPictureResetBuffers(WebPPicture* const picture);
 
-// Allocates ARGB buffer of given dimension (previous one is always free'd).
-// Preserves the YUV(A) buffer. Returns false in case of error (invalid param,
-// out-of-memory).
-int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
+// Allocates ARGB buffer according to set width/height (previous one is
+// always free'd). Preserves the YUV(A) buffer. Returns false in case of error
+// (invalid param, out-of-memory).
+int WebPPictureAllocARGB(WebPPicture* const picture);
 
-// Allocates YUVA buffer of given dimension (previous one is always free'd).
-// Uses picture->csp to determine whether an alpha buffer is needed.
+// Allocates YUVA buffer according to set width/height (previous one is always
+// free'd). Uses picture->csp to determine whether an alpha buffer is needed.
 // Preserves the ARGB buffer.
 // Returns false in case of error (invalid param, out-of-memory).
-int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
+int WebPPictureAllocYUVA(WebPPicture* const picture);
 
-// Clean-up the RGB samples under fully transparent area, to help lossless
-// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+// Replace samples that are fully transparent by 'color' to help compressibility
+// (no guarantee, though). Assumes pic->use_argb is true.
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color);
 
 //------------------------------------------------------------------------------
 
diff --git a/src/3rdparty/libwebp/src/enc/vp8l_enc.c b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
index 2efd403..0b07e52 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l_enc.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,15 +15,16 @@
 #include <assert.h>
 #include <stdlib.h>
 
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8li_enc.h"
-#include "src/dsp/lossless.h"
-#include "src/dsp/lossless_common.h"
 #include "src/utils/bit_writer_utils.h"
 #include "src/utils/huffman_encode_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/encode.h"
 #include "src/webp/format_constants.h"
 
 // Maximum number of histogram images (sub-blocks).
@@ -65,25 +66,22 @@ static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
   *col2 = tmp;
 }
 
-static void GreedyMinimizeDeltas(uint32_t palette[], int num_colors) {
-  // Find greedily always the closest color of the predicted color to minimize
-  // deltas in the palette. This reduces storage needs since the
-  // palette is stored with delta encoding.
-  uint32_t predict = 0x00000000;
-  int i, k;
-  for (i = 0; i < num_colors; ++i) {
-    int best_ix = i;
-    uint32_t best_score = ~0U;
-    for (k = i; k < num_colors; ++k) {
-      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
-      if (best_score > cur_score) {
-        best_score = cur_score;
-        best_ix = k;
-      }
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+                                        int num_colors) {
+  int low = 0, hi = num_colors;
+  if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
+  while (1) {
+    const int mid = (low + hi) >> 1;
+    if (sorted[mid] == color) {
+      return mid;
+    } else if (sorted[mid] < color) {
+      low = mid;
+    } else {
+      hi = mid;
     }
-    SwapColor(&palette[best_ix], &palette[i]);
-    predict = palette[i];
   }
+  assert(0);
+  return 0;
 }
 
 // The palette has been sorted by alpha. This function checks if the other
@@ -92,7 +90,8 @@ static void GreedyMinimizeDeltas(uint32_t palette[], int num_colors) {
 // no benefit to re-organize them greedily. A monotonic development
 // would be spotted in green-only situations (like lossy alpha) or gray-scale
 // images.
-static int PaletteHasNonMonotonousDeltas(uint32_t palette[], int num_colors) {
+static int PaletteHasNonMonotonousDeltas(const uint32_t* const palette,
+                                         int num_colors) {
   uint32_t predict = 0x000000;
   int i;
   uint8_t sign_found = 0x00;
@@ -115,28 +114,218 @@ static int PaletteHasNonMonotonousDeltas(uint32_t palette[], int num_colors) {
   return (sign_found & (sign_found << 1)) != 0;  // two consequent signs.
 }
 
+static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
+                                      int num_colors, uint32_t* const palette) {
+  uint32_t predict = 0x00000000;
+  int i, k;
+  memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+  if (!PaletteHasNonMonotonousDeltas(palette_sorted, num_colors)) return;
+  // Find greedily always the closest color of the predicted color to minimize
+  // deltas in the palette. This reduces storage needs since the
+  // palette is stored with delta encoding.
+  for (i = 0; i < num_colors; ++i) {
+    int best_ix = i;
+    uint32_t best_score = ~0U;
+    for (k = i; k < num_colors; ++k) {
+      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+      if (best_score > cur_score) {
+        best_score = cur_score;
+        best_ix = k;
+      }
+    }
+    SwapColor(&palette[best_ix], &palette[i]);
+    predict = palette[i];
+  }
+}
+
+// Sort palette in increasing order and prepare an inverse mapping array.
+static void PrepareMapToPalette(const uint32_t palette[], uint32_t num_colors,
+                                uint32_t sorted[], uint32_t idx_map[]) {
+  uint32_t i;
+  memcpy(sorted, palette, num_colors * sizeof(*sorted));
+  qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
+  for (i = 0; i < num_colors; ++i) {
+    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
+  }
+}
+
 // -----------------------------------------------------------------------------
-// Palette
+// Modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+
+// Finds the biggest cooccurrence in the matrix.
+static void CoOccurrenceFindMax(const uint32_t* const cooccurrence,
+                                uint32_t num_colors, uint8_t* const c1,
+                                uint8_t* const c2) {
+  // Find the index that is most frequently located adjacent to other
+  // (different) indexes.
+  uint32_t best_sum = 0u;
+  uint32_t i, j, best_cooccurrence;
+  *c1 = 0u;
+  for (i = 0; i < num_colors; ++i) {
+    uint32_t sum = 0;
+    for (j = 0; j < num_colors; ++j) sum += cooccurrence[i * num_colors + j];
+    if (sum > best_sum) {
+      best_sum = sum;
+      *c1 = i;
+    }
+  }
+  // Find the index that is most frequently found adjacent to *c1.
+  *c2 = 0u;
+  best_cooccurrence = 0u;
+  for (i = 0; i < num_colors; ++i) {
+    if (cooccurrence[*c1 * num_colors + i] > best_cooccurrence) {
+      best_cooccurrence = cooccurrence[*c1 * num_colors + i];
+      *c2 = i;
+    }
+  }
+  assert(*c1 != *c2);
+}
 
-// If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
-// creates a palette and returns true, else returns false.
-static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
-                                   int low_effort,
-                                   uint32_t palette[MAX_PALETTE_SIZE],
-                                   int* const palette_size) {
-  const int num_colors = WebPGetColorPalette(pic, palette);
-  if (num_colors > MAX_PALETTE_SIZE) {
-    *palette_size = 0;
+// Builds the cooccurrence matrix
+static int CoOccurrenceBuild(const WebPPicture* const pic,
+                             const uint32_t* const palette, uint32_t num_colors,
+                             uint32_t* cooccurrence) {
+  uint32_t *lines, *line_top, *line_current, *line_tmp;
+  int x, y;
+  const uint32_t* src = pic->argb;
+  uint32_t prev_pix = ~src[0];
+  uint32_t prev_idx = 0u;
+  uint32_t idx_map[MAX_PALETTE_SIZE] = {0};
+  uint32_t palette_sorted[MAX_PALETTE_SIZE];
+  lines = (uint32_t*)WebPSafeMalloc(2 * pic->width, sizeof(*lines));
+  if (lines == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     return 0;
   }
-  *palette_size = num_colors;
-  qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
-  if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
-    GreedyMinimizeDeltas(palette, num_colors);
+  line_top = &lines[0];
+  line_current = &lines[pic->width];
+  PrepareMapToPalette(palette, num_colors, palette_sorted, idx_map);
+  for (y = 0; y < pic->height; ++y) {
+    for (x = 0; x < pic->width; ++x) {
+      const uint32_t pix = src[x];
+      if (pix != prev_pix) {
+        prev_idx = idx_map[SearchColorNoIdx(palette_sorted, pix, num_colors)];
+        prev_pix = pix;
+      }
+      line_current[x] = prev_idx;
+      // 4-connectivity is what works best as mentioned in "On the relation
+      // between Memon's and the modified Zeng's palette reordering methods".
+      if (x > 0 && prev_idx != line_current[x - 1]) {
+        const uint32_t left_idx = line_current[x - 1];
+        ++cooccurrence[prev_idx * num_colors + left_idx];
+        ++cooccurrence[left_idx * num_colors + prev_idx];
+      }
+      if (y > 0 && prev_idx != line_top[x]) {
+        const uint32_t top_idx = line_top[x];
+        ++cooccurrence[prev_idx * num_colors + top_idx];
+        ++cooccurrence[top_idx * num_colors + prev_idx];
+      }
+    }
+    line_tmp = line_top;
+    line_top = line_current;
+    line_current = line_tmp;
+    src += pic->argb_stride;
   }
+  WebPSafeFree(lines);
   return 1;
 }
 
+struct Sum {
+  uint8_t index;
+  uint32_t sum;
+};
+
+// Implements the modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+static int PaletteSortModifiedZeng(
+    const WebPPicture* const pic, const uint32_t* const palette_sorted,
+    uint32_t num_colors, uint32_t* const palette) {
+  uint32_t i, j, ind;
+  uint8_t remapping[MAX_PALETTE_SIZE];
+  uint32_t* cooccurrence;
+  struct Sum sums[MAX_PALETTE_SIZE];
+  uint32_t first, last;
+  uint32_t num_sums;
+  // TODO(vrabaud) check whether one color images should use palette or not.
+  if (num_colors <= 1) return 1;
+  // Build the co-occurrence matrix.
+  cooccurrence =
+      (uint32_t*)WebPSafeCalloc(num_colors * num_colors, sizeof(*cooccurrence));
+  if (cooccurrence == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return 0;
+  }
+  if (!CoOccurrenceBuild(pic, palette_sorted, num_colors, cooccurrence)) {
+    return 0;
+  }
+
+  // Initialize the mapping list with the two best indices.
+  CoOccurrenceFindMax(cooccurrence, num_colors, &remapping[0], &remapping[1]);
+
+  // We need to append and prepend to the list of remapping. To this end, we
+  // actually define the next start/end of the list as indices in a vector (with
+  // a wrap around when the end is reached).
+  first = 0;
+  last = 1;
+  num_sums = num_colors - 2;  // -2 because we know the first two values
+  if (num_sums > 0) {
+    // Initialize the sums with the first two remappings and find the best one
+    struct Sum* best_sum = &sums[0];
+    best_sum->index = 0u;
+    best_sum->sum = 0u;
+    for (i = 0, j = 0; i < num_colors; ++i) {
+      if (i == remapping[0] || i == remapping[1]) continue;
+      sums[j].index = i;
+      sums[j].sum = cooccurrence[i * num_colors + remapping[0]] +
+                    cooccurrence[i * num_colors + remapping[1]];
+      if (sums[j].sum > best_sum->sum) best_sum = &sums[j];
+      ++j;
+    }
+
+    while (num_sums > 0) {
+      const uint8_t best_index = best_sum->index;
+      // Compute delta to know if we need to prepend or append the best index.
+      int32_t delta = 0;
+      const int32_t n = num_colors - num_sums;
+      for (ind = first, j = 0; (ind + j) % num_colors != last + 1; ++j) {
+        const uint16_t l_j = remapping[(ind + j) % num_colors];
+        delta += (n - 1 - 2 * (int32_t)j) *
+                 (int32_t)cooccurrence[best_index * num_colors + l_j];
+      }
+      if (delta > 0) {
+        first = (first == 0) ? num_colors - 1 : first - 1;
+        remapping[first] = best_index;
+      } else {
+        ++last;
+        remapping[last] = best_index;
+      }
+      // Remove best_sum from sums.
+      *best_sum = sums[num_sums - 1];
+      --num_sums;
+      // Update all the sums and find the best one.
+      best_sum = &sums[0];
+      for (i = 0; i < num_sums; ++i) {
+        sums[i].sum += cooccurrence[best_index * num_colors + sums[i].index];
+        if (sums[i].sum > best_sum->sum) best_sum = &sums[i];
+      }
+    }
+  }
+  assert((last + 1) % num_colors == first);
+  WebPSafeFree(cooccurrence);
+
+  // Re-map the palette.
+  for (i = 0; i < num_colors; ++i) {
+    palette[i] = palette_sorted[remapping[(first + i) % num_colors]];
+  }
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// Palette
+
 // These five modes are evaluated and their respective entropy is computed.
 typedef enum {
   kDirect = 0,
@@ -144,10 +333,18 @@ typedef enum {
   kSubGreen = 2,
   kSpatialSubGreen = 3,
   kPalette = 4,
-  kNumEntropyIx = 5
+  kPaletteAndSpatial = 5,
+  kNumEntropyIx = 6
 } EntropyIx;
 
 typedef enum {
+  kSortedDefault = 0,
+  kMinimizeDelta = 1,
+  kModifiedZeng = 2,
+  kUnusedPalette = 3,
+} PaletteSorting;
+
+typedef enum {
   kHistoAlpha = 0,
   kHistoAlphaPred,
   kHistoGreen,
@@ -164,10 +361,11 @@ typedef enum {
   kHistoTotal  // Must be last.
 } HistoIx;
 
-static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
-  const int green = p >> 8;  // The upper bits are masked away later.
-  ++r[((p >> 16) - green) & 0xff];
-  ++b[((p >>  0) - green) & 0xff];
+static void AddSingleSubGreen(uint32_t p,
+                              uint32_t* const r, uint32_t* const b) {
+  const int green = (int)p >> 8;  // The upper bits are masked away later.
+  ++r[(((int)p >> 16) - green) & 0xff];
+  ++b[(((int)p >>  0) - green) & 0xff];
 }
 
 static void AddSingle(uint32_t p,
@@ -241,8 +439,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
       curr_row += argb_stride;
     }
     {
-      double entropy_comp[kHistoTotal];
-      double entropy[kNumEntropyIx];
+      float entropy_comp[kHistoTotal];
+      float entropy[kNumEntropyIx];
       int k;
       int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
       int j;
@@ -354,14 +552,21 @@ static int GetTransformBits(int method, int histo_bits) {
 }
 
 // Set of parameters to be used in each iteration of the cruncher.
-#define CRUNCH_CONFIGS_LZ77_MAX 2
+#define CRUNCH_SUBCONFIGS_MAX 2
+typedef struct {
+  int lz77_;
+  int do_no_cache_;
+} CrunchSubConfig;
 typedef struct {
   int entropy_idx_;
-  int lz77s_types_to_try_[CRUNCH_CONFIGS_LZ77_MAX];
-  int lz77s_types_to_try_size_;
+  PaletteSorting palette_sorting_type_;
+  CrunchSubConfig sub_configs_[CRUNCH_SUBCONFIGS_MAX];
+  int sub_configs_size_;
 } CrunchConfig;
 
-#define CRUNCH_CONFIGS_MAX kNumEntropyIx
+// +2 because we add a palette sorting configuration for kPalette and
+// kPaletteAndSpatial.
+#define CRUNCH_CONFIGS_MAX (kNumEntropyIx + 2)
 
 static int EncoderAnalyze(VP8LEncoder* const enc,
                           CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
@@ -376,11 +581,20 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
   int i;
   int use_palette;
   int n_lz77s;
+  // If set to 0, analyze the cache with the computed cache value. If 1, also
+  // analyze with no-cache.
+  int do_no_cache = 0;
   assert(pic != NULL && pic->argb != NULL);
 
-  use_palette =
-      AnalyzeAndCreatePalette(pic, low_effort,
-                              enc->palette_, &enc->palette_size_);
+  // Check whether a palette is possible.
+  enc->palette_size_ = WebPGetColorPalette(pic, enc->palette_sorted_);
+  use_palette = (enc->palette_size_ <= MAX_PALETTE_SIZE);
+  if (!use_palette) {
+    enc->palette_size_ = 0;
+  } else {
+    qsort(enc->palette_sorted_, enc->palette_size_,
+          sizeof(*enc->palette_sorted_), PaletteCompareColorsForQsort);
+  }
 
   // Empirical bit sizes.
   enc->histo_bits_ = GetHistoBits(method, use_palette,
@@ -390,6 +604,8 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
   if (low_effort) {
     // AnalyzeEntropy is somewhat slow.
     crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+    crunch_configs[0].palette_sorting_type_ =
+        use_palette ? kSortedDefault : kUnusedPalette;
     n_lz77s = 1;
     *crunch_configs_size = 1;
   } else {
@@ -402,29 +618,59 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
       return 0;
     }
     if (method == 6 && config->quality == 100) {
+      do_no_cache = 1;
       // Go brute force on all transforms.
       *crunch_configs_size = 0;
       for (i = 0; i < kNumEntropyIx; ++i) {
-        if (i != kPalette || use_palette) {
+        // We can only apply kPalette or kPaletteAndSpatial if we can indeed use
+        // a palette.
+        if ((i != kPalette && i != kPaletteAndSpatial) || use_palette) {
           assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
-          crunch_configs[(*crunch_configs_size)++].entropy_idx_ = i;
+          crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+          if (use_palette && (i == kPalette || i == kPaletteAndSpatial)) {
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kMinimizeDelta;
+            ++*crunch_configs_size;
+            // Also add modified Zeng's method.
+            crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kModifiedZeng;
+          } else {
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kUnusedPalette;
+          }
+          ++*crunch_configs_size;
         }
       }
     } else {
       // Only choose the guessed best transform.
       *crunch_configs_size = 1;
       crunch_configs[0].entropy_idx_ = min_entropy_ix;
+      crunch_configs[0].palette_sorting_type_ =
+          use_palette ? kMinimizeDelta : kUnusedPalette;
+      if (config->quality >= 75 && method == 5) {
+        // Test with and without color cache.
+        do_no_cache = 1;
+        // If we have a palette, also check in combination with spatial.
+        if (min_entropy_ix == kPalette) {
+          *crunch_configs_size = 2;
+          crunch_configs[1].entropy_idx_ = kPaletteAndSpatial;
+          crunch_configs[1].palette_sorting_type_ = kMinimizeDelta;
+        }
+      }
     }
   }
   // Fill in the different LZ77s.
-  assert(n_lz77s <= CRUNCH_CONFIGS_LZ77_MAX);
+  assert(n_lz77s <= CRUNCH_SUBCONFIGS_MAX);
   for (i = 0; i < *crunch_configs_size; ++i) {
     int j;
     for (j = 0; j < n_lz77s; ++j) {
-      crunch_configs[i].lz77s_types_to_try_[j] =
+      assert(j < CRUNCH_SUBCONFIGS_MAX);
+      crunch_configs[i].sub_configs_[j].lz77_ =
           (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box;
+      crunch_configs[i].sub_configs_[j].do_no_cache_ = do_no_cache;
     }
-    crunch_configs[i].lz77s_types_to_try_size_ = n_lz77s;
+    crunch_configs[i].sub_configs_size_ = n_lz77s;
   }
   return 1;
 }
@@ -440,7 +686,7 @@ static int EncoderInit(VP8LEncoder* const enc) {
   int i;
   if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
 
-  for (i = 0; i < 3; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
+  for (i = 0; i < 4; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
 
   return 1;
 }
@@ -708,11 +954,11 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
   VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
 }
 
-static WebPEncodingError StoreImageToBitMask(
+static int StoreImageToBitMask(
     VP8LBitWriter* const bw, int width, int histo_bits,
     const VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
-    const HuffmanTreeCode* const huffman_codes) {
+    const HuffmanTreeCode* const huffman_codes, const WebPPicture* const pic) {
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
   const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
   // x and y trace the position in the image.
@@ -765,49 +1011,53 @@ static WebPEncodingError StoreImageToBitMask(
     }
     VP8LRefsCursorNext(&c);
   }
-  return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
+  if (bw->error_) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return 0;
+  }
+  return 1;
 }
 
-// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
-static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
-                                              const uint32_t* const argb,
-                                              VP8LHashChain* const hash_chain,
-                                              VP8LBackwardRefs* const refs_tmp1,
-                                              VP8LBackwardRefs* const refs_tmp2,
-                                              int width, int height,
-                                              int quality, int low_effort) {
+// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31.
+// pic and percent are for progress.
+static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
+                                const uint32_t* const argb,
+                                VP8LHashChain* const hash_chain,
+                                VP8LBackwardRefs* const refs_array, int width,
+                                int height, int quality, int low_effort,
+                                const WebPPicture* const pic, int percent_range,
+                                int* const percent) {
   int i;
   int max_tokens = 0;
-  WebPEncodingError err = VP8_ENC_OK;
   VP8LBackwardRefs* refs;
   HuffmanTreeToken* tokens = NULL;
-  HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
-  const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
+  HuffmanTreeCode huffman_codes[5] = {{0, NULL, NULL}};
+  const uint16_t histogram_symbols[1] = {0};  // only one tree, one symbol
   int cache_bits = 0;
   VP8LHistogramSet* histogram_image = NULL;
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
-        3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+      3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   if (huff_tree == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   // Calculate backward references from ARGB image.
-  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
-                         low_effort)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height, low_effort,
+                         pic, percent_range / 2, percent)) {
     goto Error;
   }
-  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0,
-                                   kLZ77Standard | kLZ77RLE, &cache_bits,
-                                   hash_chain, refs_tmp1, refs_tmp2);
-  if (refs == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (!VP8LGetBackwardReferences(width, height, argb, quality, /*low_effort=*/0,
+                                 kLZ77Standard | kLZ77RLE, cache_bits,
+                                 /*do_no_cache=*/0, hash_chain, refs_array,
+                                 &cache_bits, pic,
+                                 percent_range - percent_range / 2, percent)) {
     goto Error;
   }
+  refs = &refs_array[0];
   histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
   if (histogram_image == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
   VP8LHistogramSetClear(histogram_image);
@@ -818,7 +1068,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   // Create Huffman bit lengths and codes for each histogram image.
   assert(histogram_image->size == 1);
   if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
@@ -835,7 +1085,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
 
   tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
   if (tokens == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
@@ -847,27 +1097,32 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   }
 
   // Store actual literals.
-  err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
-                            huffman_codes);
+  if (!StoreImageToBitMask(bw, width, 0, refs, histogram_symbols, huffman_codes,
+                           pic)) {
+    goto Error;
+  }
 
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
   WebPSafeFree(huffman_codes[0].codes);
-  return err;
+  return (pic->error_code == VP8_ENC_OK);
 }
 
-static WebPEncodingError EncodeImageInternal(
+// pic and percent are for progress.
+static int EncodeImageInternal(
     VP8LBitWriter* const bw, const uint32_t* const argb,
-    VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[3], int width,
+    VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
     int height, int quality, int low_effort, int use_cache,
     const CrunchConfig* const config, int* cache_bits, int histogram_bits,
-    size_t init_byte_position, int* const hdr_size, int* const data_size) {
-  WebPEncodingError err = VP8_ENC_OK;
+    size_t init_byte_position, int* const hdr_size, int* const data_size,
+    const WebPPicture* const pic, int percent_range, int* const percent) {
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
+  int remaining_percent = percent_range;
+  int percent_start = *percent;
   VP8LHistogramSet* histogram_image = NULL;
   VP8LHistogram* tmp_histo = NULL;
   int histogram_image_size = 0;
@@ -876,112 +1131,135 @@ static WebPEncodingError EncodeImageInternal(
       3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
-  VP8LBackwardRefs* refs_best;
-  VP8LBackwardRefs* refs_tmp;
-  uint16_t* const histogram_symbols =
-      (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
-                                sizeof(*histogram_symbols));
-  int lz77s_idx;
+  uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(
+      histogram_image_xysize, sizeof(*histogram_symbols));
+  int sub_configs_idx;
+  int cache_bits_init, write_histogram_image;
   VP8LBitWriter bw_init = *bw, bw_best;
   int hdr_size_tmp;
+  VP8LHashChain hash_chain_histogram;  // histogram image hash chain
+  size_t bw_size_best = ~(size_t)0;
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
   assert(hdr_size != NULL);
   assert(data_size != NULL);
 
-  if (histogram_symbols == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  memset(&hash_chain_histogram, 0, sizeof(hash_chain_histogram));
+  if (!VP8LBitWriterInit(&bw_best, 0)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    goto Error;
+  }
+
+  // Make sure we can allocate the different objects.
+  if (huff_tree == NULL || histogram_symbols == NULL ||
+      !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
+  percent_range = remaining_percent / 5;
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort, pic, percent_range, percent)) {
+    goto Error;
+  }
+  percent_start += percent_range;
+  remaining_percent -= percent_range;
+
   if (use_cache) {
     // If the value is different from zero, it has been set during the
     // palette analysis.
-    if (*cache_bits == 0) *cache_bits = MAX_COLOR_CACHE_BITS;
+    cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits;
   } else {
-    *cache_bits = 0;
+    cache_bits_init = 0;
   }
-  // 'best_refs' is the reference to the best backward refs and points to one
-  // of refs_array[0] or refs_array[1].
-  // Calculate backward references from ARGB image.
-  if (huff_tree == NULL ||
-      !VP8LHashChainFill(hash_chain, quality, argb, width, height,
-                         low_effort) ||
-      !VP8LBitWriterInit(&bw_best, 0) ||
-      (config->lz77s_types_to_try_size_ > 1 &&
-       !VP8LBitWriterClone(bw, &bw_best))) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  // If several iterations will happen, clone into bw_best.
+  if ((config->sub_configs_size_ > 1 || config->sub_configs_[0].do_no_cache_) &&
+      !VP8LBitWriterClone(bw, &bw_best)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
-  for (lz77s_idx = 0; lz77s_idx < config->lz77s_types_to_try_size_;
-       ++lz77s_idx) {
-    refs_best = VP8LGetBackwardReferences(
-        width, height, argb, quality, low_effort,
-        config->lz77s_types_to_try_[lz77s_idx], cache_bits, hash_chain,
-        &refs_array[0], &refs_array[1]);
-    if (refs_best == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
-    }
-    // Keep the best references aside and use the other element from the first
-    // two as a temporary for later usage.
-    refs_tmp = &refs_array[refs_best == &refs_array[0] ? 1 : 0];
-
-    histogram_image =
-        VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
-    tmp_histo = VP8LAllocateHistogram(*cache_bits);
-    if (histogram_image == NULL || tmp_histo == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
-    }
 
-    // Build histogram image and symbols from backward references.
-    if (!VP8LGetHistoImageSymbols(width, height, refs_best, quality, low_effort,
-                                  histogram_bits, *cache_bits, histogram_image,
-                                  tmp_histo, histogram_symbols)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
-    }
-    // Create Huffman bit lengths and codes for each histogram image.
-    histogram_image_size = histogram_image->size;
-    bit_array_size = 5 * histogram_image_size;
-    huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
-                                                     sizeof(*huffman_codes));
-    // Note: some histogram_image entries may point to tmp_histos[], so the
-    // latter need to outlive the following call to GetHuffBitLengthsAndCodes().
-    if (huffman_codes == NULL ||
-        !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  for (sub_configs_idx = 0; sub_configs_idx < config->sub_configs_size_;
+       ++sub_configs_idx) {
+    const CrunchSubConfig* const sub_config =
+        &config->sub_configs_[sub_configs_idx];
+    int cache_bits_best, i_cache;
+    int i_remaining_percent = remaining_percent / config->sub_configs_size_;
+    int i_percent_range = i_remaining_percent / 4;
+    i_remaining_percent -= i_percent_range;
+
+    if (!VP8LGetBackwardReferences(
+            width, height, argb, quality, low_effort, sub_config->lz77_,
+            cache_bits_init, sub_config->do_no_cache_, hash_chain,
+            &refs_array[0], &cache_bits_best, pic, i_percent_range, percent)) {
       goto Error;
     }
-    // Free combined histograms.
-    VP8LFreeHistogramSet(histogram_image);
-    histogram_image = NULL;
 
-    // Free scratch histograms.
-    VP8LFreeHistogram(tmp_histo);
-    tmp_histo = NULL;
+    for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
+      const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
+      // Speed-up: no need to study the no-cache case if it was already studied
+      // in i_cache == 0.
+      if (i_cache == 1 && cache_bits_best == 0) break;
+
+      // Reset the bit writer for this iteration.
+      VP8LBitWriterReset(&bw_init, bw);
+
+      // Build histogram image and symbols from backward references.
+      histogram_image =
+          VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits_tmp);
+      tmp_histo = VP8LAllocateHistogram(cache_bits_tmp);
+      if (histogram_image == NULL || tmp_histo == NULL) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+        goto Error;
+      }
 
-    // Color Cache parameters.
-    if (*cache_bits > 0) {
-      VP8LPutBits(bw, 1, 1);
-      VP8LPutBits(bw, *cache_bits, 4);
-    } else {
-      VP8LPutBits(bw, 0, 1);
-    }
+      i_percent_range = i_remaining_percent / 3;
+      i_remaining_percent -= i_percent_range;
+      if (!VP8LGetHistoImageSymbols(
+              width, height, &refs_array[i_cache], quality, low_effort,
+              histogram_bits, cache_bits_tmp, histogram_image, tmp_histo,
+              histogram_symbols, pic, i_percent_range, percent)) {
+        goto Error;
+      }
+      // Create Huffman bit lengths and codes for each histogram image.
+      histogram_image_size = histogram_image->size;
+      bit_array_size = 5 * histogram_image_size;
+      huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+                                                       sizeof(*huffman_codes));
+      // Note: some histogram_image entries may point to tmp_histos[], so the
+      // latter need to outlive the following call to
+      // GetHuffBitLengthsAndCodes().
+      if (huffman_codes == NULL ||
+          !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+        goto Error;
+      }
+      // Free combined histograms.
+      VP8LFreeHistogramSet(histogram_image);
+      histogram_image = NULL;
+
+      // Free scratch histograms.
+      VP8LFreeHistogram(tmp_histo);
+      tmp_histo = NULL;
+
+      // Color Cache parameters.
+      if (cache_bits_tmp > 0) {
+        VP8LPutBits(bw, 1, 1);
+        VP8LPutBits(bw, cache_bits_tmp, 4);
+      } else {
+        VP8LPutBits(bw, 0, 1);
+      }
 
-    // Huffman image + meta huffman.
-    {
-      const int write_histogram_image = (histogram_image_size > 1);
+      // Huffman image + meta huffman.
+      write_histogram_image = (histogram_image_size > 1);
       VP8LPutBits(bw, write_histogram_image, 1);
       if (write_histogram_image) {
-        uint32_t* const histogram_argb =
-            (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
-                                      sizeof(*histogram_argb));
+        uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
+            histogram_image_xysize, sizeof(*histogram_argb));
         int max_index = 0;
         uint32_t i;
         if (histogram_argb == NULL) {
-          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
           goto Error;
         }
         for (i = 0; i < histogram_image_xysize; ++i) {
@@ -994,73 +1272,81 @@ static WebPEncodingError EncodeImageInternal(
         histogram_image_size = max_index;
 
         VP8LPutBits(bw, histogram_bits - 2, 3);
-        err = EncodeImageNoHuffman(
-            bw, histogram_argb, hash_chain, refs_tmp, &refs_array[2],
-            VP8LSubSampleSize(width, histogram_bits),
-            VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+        i_percent_range = i_remaining_percent / 2;
+        i_remaining_percent -= i_percent_range;
+        if (!EncodeImageNoHuffman(
+                bw, histogram_argb, &hash_chain_histogram, &refs_array[2],
+                VP8LSubSampleSize(width, histogram_bits),
+                VP8LSubSampleSize(height, histogram_bits), quality, low_effort,
+                pic, i_percent_range, percent)) {
+          WebPSafeFree(histogram_argb);
+          goto Error;
+        }
         WebPSafeFree(histogram_argb);
-        if (err != VP8_ENC_OK) goto Error;
       }
-    }
 
-    // Store Huffman codes.
-    {
-      int i;
-      int max_tokens = 0;
-      // Find maximum number of symbols for the huffman tree-set.
-      for (i = 0; i < 5 * histogram_image_size; ++i) {
-        HuffmanTreeCode* const codes = &huffman_codes[i];
-        if (max_tokens < codes->num_symbols) {
-          max_tokens = codes->num_symbols;
+      // Store Huffman codes.
+      {
+        int i;
+        int max_tokens = 0;
+        // Find maximum number of symbols for the huffman tree-set.
+        for (i = 0; i < 5 * histogram_image_size; ++i) {
+          HuffmanTreeCode* const codes = &huffman_codes[i];
+          if (max_tokens < codes->num_symbols) {
+            max_tokens = codes->num_symbols;
+          }
+        }
+        tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+        if (tokens == NULL) goto Error;
+        for (i = 0; i < 5 * histogram_image_size; ++i) {
+          HuffmanTreeCode* const codes = &huffman_codes[i];
+          StoreHuffmanCode(bw, huff_tree, tokens, codes);
+          ClearHuffmanTreeIfOnlyOneSymbol(codes);
         }
       }
-      tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
-      if (tokens == NULL) {
-        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      // Store actual literals.
+      hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
+      if (!StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
+                               histogram_symbols, huffman_codes, pic)) {
         goto Error;
       }
-      for (i = 0; i < 5 * histogram_image_size; ++i) {
-        HuffmanTreeCode* const codes = &huffman_codes[i];
-        StoreHuffmanCode(bw, huff_tree, tokens, codes);
-        ClearHuffmanTreeIfOnlyOneSymbol(codes);
+      // Keep track of the smallest image so far.
+      if (VP8LBitWriterNumBytes(bw) < bw_size_best) {
+        bw_size_best = VP8LBitWriterNumBytes(bw);
+        *cache_bits = cache_bits_tmp;
+        *hdr_size = hdr_size_tmp;
+        *data_size =
+            (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+        VP8LBitWriterSwap(bw, &bw_best);
+      }
+      WebPSafeFree(tokens);
+      tokens = NULL;
+      if (huffman_codes != NULL) {
+        WebPSafeFree(huffman_codes->codes);
+        WebPSafeFree(huffman_codes);
+        huffman_codes = NULL;
       }
-    }
-    // Store actual literals.
-    hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
-    err = StoreImageToBitMask(bw, width, histogram_bits, refs_best,
-                              histogram_symbols, huffman_codes);
-    // Keep track of the smallest image so far.
-    if (lz77s_idx == 0 ||
-        VP8LBitWriterNumBytes(bw) < VP8LBitWriterNumBytes(&bw_best)) {
-      *hdr_size = hdr_size_tmp;
-      *data_size =
-          (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
-      VP8LBitWriterSwap(bw, &bw_best);
-    }
-    // Reset the bit writer for the following iteration if any.
-    if (config->lz77s_types_to_try_size_ > 1) VP8LBitWriterReset(&bw_init, bw);
-    WebPSafeFree(tokens);
-    tokens = NULL;
-    if (huffman_codes != NULL) {
-      WebPSafeFree(huffman_codes->codes);
-      WebPSafeFree(huffman_codes);
-      huffman_codes = NULL;
     }
   }
   VP8LBitWriterSwap(bw, &bw_best);
 
+  if (!WebPReportProgress(pic, percent_start + remaining_percent, percent)) {
+    goto Error;
+  }
+
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
   VP8LFreeHistogram(tmp_histo);
+  VP8LHashChainClear(&hash_chain_histogram);
   if (huffman_codes != NULL) {
     WebPSafeFree(huffman_codes->codes);
     WebPSafeFree(huffman_codes);
   }
   WebPSafeFree(histogram_symbols);
   VP8LBitWriterWipeOut(&bw_best);
-  return err;
+  return (pic->error_code == VP8_ENC_OK);
 }
 
 // -----------------------------------------------------------------------------
@@ -1069,72 +1355,73 @@ static WebPEncodingError EncodeImageInternal(
 static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
                                VP8LBitWriter* const bw) {
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
-  VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+  VP8LPutBits(bw, SUBTRACT_GREEN_TRANSFORM, 2);
   VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
 }
 
-static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
-                                            int width, int height,
-                                            int quality, int low_effort,
-                                            int used_subtract_green,
-                                            VP8LBitWriter* const bw) {
+static int ApplyPredictFilter(const VP8LEncoder* const enc, int width,
+                              int height, int quality, int low_effort,
+                              int used_subtract_green, VP8LBitWriter* const bw,
+                              int percent_range, int* const percent) {
   const int pred_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, pred_bits);
   const int transform_height = VP8LSubSampleSize(height, pred_bits);
   // we disable near-lossless quantization if palette is used.
-  const int near_lossless_strength = enc->use_palette_ ? 100
-                                   : enc->config_->near_lossless;
+  const int near_lossless_strength =
+      enc->use_palette_ ? 100 : enc->config_->near_lossless;
 
-  VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
-                    enc->argb_scratch_, enc->transform_data_,
-                    near_lossless_strength, enc->config_->exact,
-                    used_subtract_green);
+  if (!VP8LResidualImage(
+          width, height, pred_bits, low_effort, enc->argb_, enc->argb_scratch_,
+          enc->transform_data_, near_lossless_strength, enc->config_->exact,
+          used_subtract_green, enc->pic_, percent_range / 2, percent)) {
+    return 0;
+  }
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   assert(pred_bits >= 2);
   VP8LPutBits(bw, pred_bits - 2, 3);
   return EncodeImageNoHuffman(
       bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
-      (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height,
-      quality, low_effort);
+      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      quality, low_effort, enc->pic_, percent_range - percent_range / 2,
+      percent);
 }
 
-static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
-                                               int width, int height,
-                                               int quality, int low_effort,
-                                               VP8LBitWriter* const bw) {
+static int ApplyCrossColorFilter(const VP8LEncoder* const enc, int width,
+                                 int height, int quality, int low_effort,
+                                 VP8LBitWriter* const bw, int percent_range,
+                                 int* const percent) {
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
   const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
 
-  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
-                          enc->argb_, enc->transform_data_);
+  if (!VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
+                               enc->argb_, enc->transform_data_, enc->pic_,
+                               percent_range / 2, percent)) {
+    return 0;
+  }
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   assert(ccolor_transform_bits >= 2);
   VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
   return EncodeImageNoHuffman(
       bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
-      (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height,
-      quality, low_effort);
+      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      quality, low_effort, enc->pic_, percent_range - percent_range / 2,
+      percent);
 }
 
 // -----------------------------------------------------------------------------
 
-static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
-                                         size_t riff_size, size_t vp8l_size) {
+static int WriteRiffHeader(const WebPPicture* const pic, size_t riff_size,
+                           size_t vp8l_size) {
   uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
     'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P',
     'V', 'P', '8', 'L', 0, 0, 0, 0, VP8L_MAGIC_BYTE,
   };
   PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
   PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size);
-  if (!pic->writer(riff, sizeof(riff), pic)) {
-    return VP8_ENC_ERROR_BAD_WRITE;
-  }
-  return VP8_ENC_OK;
+  return pic->writer(riff, sizeof(riff), pic);
 }
 
 static int WriteImageSize(const WebPPicture* const pic,
@@ -1154,36 +1441,29 @@ static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
   return !bw->error_;
 }
 
-static WebPEncodingError WriteImage(const WebPPicture* const pic,
-                                    VP8LBitWriter* const bw,
-                                    size_t* const coded_size) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int WriteImage(const WebPPicture* const pic, VP8LBitWriter* const bw,
+                      size_t* const coded_size) {
   const uint8_t* const webpll_data = VP8LBitWriterFinish(bw);
   const size_t webpll_size = VP8LBitWriterNumBytes(bw);
   const size_t vp8l_size = VP8L_SIGNATURE_SIZE + webpll_size;
   const size_t pad = vp8l_size & 1;
   const size_t riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size + pad;
 
-  err = WriteRiffHeader(pic, riff_size, vp8l_size);
-  if (err != VP8_ENC_OK) goto Error;
-
-  if (!pic->writer(webpll_data, webpll_size, pic)) {
-    err = VP8_ENC_ERROR_BAD_WRITE;
-    goto Error;
+  if (!WriteRiffHeader(pic, riff_size, vp8l_size) ||
+      !pic->writer(webpll_data, webpll_size, pic)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
+    return 0;
   }
 
   if (pad) {
     const uint8_t pad_byte[1] = { 0 };
     if (!pic->writer(pad_byte, 1, pic)) {
-      err = VP8_ENC_ERROR_BAD_WRITE;
-      goto Error;
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
+      return 0;
     }
   }
   *coded_size = CHUNK_HEADER_SIZE + riff_size;
-  return VP8_ENC_OK;
-
- Error:
-  return err;
+  return 1;
 }
 
 // -----------------------------------------------------------------------------
@@ -1199,18 +1479,16 @@ static void ClearTransformBuffer(VP8LEncoder* const enc) {
 // Flags influencing the memory allocated:
 //  enc->transform_bits_
 //  enc->use_predict_, enc->use_cross_color_
-static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
-                                                 int width, int height) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int AllocateTransformBuffer(VP8LEncoder* const enc, int width,
+                                   int height) {
   const uint64_t image_size = width * height;
   // VP8LResidualImage needs room for 2 scanlines of uint32 pixels with an extra
   // pixel in each, plus 2 regular scanlines of bytes.
   // TODO(skal): Clean up by using arithmetic in bytes instead of words.
   const uint64_t argb_scratch_size =
-      enc->use_predict_
-          ? (width + 1) * 2 +
-            (width * 2 + sizeof(uint32_t) - 1) / sizeof(uint32_t)
-          : 0;
+      enc->use_predict_ ? (width + 1) * 2 + (width * 2 + sizeof(uint32_t) - 1) /
+                                                sizeof(uint32_t)
+                        : 0;
   const uint64_t transform_data_size =
       (enc->use_predict_ || enc->use_cross_color_)
           ? VP8LSubSampleSize(width, enc->transform_bits_) *
@@ -1218,17 +1496,16 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
           : 0;
   const uint64_t max_alignment_in_words =
       (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
-  const uint64_t mem_size =
-      image_size + max_alignment_in_words +
-      argb_scratch_size + max_alignment_in_words +
-      transform_data_size;
+  const uint64_t mem_size = image_size + max_alignment_in_words +
+                            argb_scratch_size + max_alignment_in_words +
+                            transform_data_size;
   uint32_t* mem = enc->transform_mem_;
   if (mem == NULL || mem_size > enc->transform_mem_size_) {
     ClearTransformBuffer(enc);
     mem = (uint32_t*)WebPSafeMalloc(mem_size, sizeof(*mem));
     if (mem == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+      WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+      return 0;
     }
     enc->transform_mem_ = mem;
     enc->transform_mem_size_ = (size_t)mem_size;
@@ -1241,19 +1518,16 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
   enc->transform_data_ = mem;
 
   enc->current_width_ = width;
- Error:
-  return err;
+  return 1;
 }
 
-static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int MakeInputImageCopy(VP8LEncoder* const enc) {
   const WebPPicture* const picture = enc->pic_;
   const int width = picture->width;
   const int height = picture->height;
 
-  err = AllocateTransformBuffer(enc, width, height);
-  if (err != VP8_ENC_OK) return err;
-  if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
+  if (!AllocateTransformBuffer(enc, width, height)) return 0;
+  if (enc->argb_content_ == kEncoderARGB) return 1;
 
   {
     uint32_t* dst = enc->argb_;
@@ -1267,27 +1541,11 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
   }
   enc->argb_content_ = kEncoderARGB;
   assert(enc->current_width_ == width);
-  return VP8_ENC_OK;
+  return 1;
 }
 
 // -----------------------------------------------------------------------------
 
-static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
-                                        int hi) {
-  int low = 0;
-  if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
-  while (1) {
-    const int mid = (low + hi) >> 1;
-    if (sorted[mid] == color) {
-      return mid;
-    } else if (sorted[mid] < color) {
-      low = mid;
-    } else {
-      hi = mid;
-    }
-  }
-}
-
 #define APPLY_PALETTE_GREEDY_MAX 4
 
 static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
@@ -1322,17 +1580,6 @@ static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
          (32 - PALETTE_INV_SIZE_BITS);
 }
 
-// Sort palette in increasing order and prepare an inverse mapping array.
-static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
-                                uint32_t sorted[], uint32_t idx_map[]) {
-  int i;
-  memcpy(sorted, palette, num_colors * sizeof(*sorted));
-  qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
-  for (i = 0; i < num_colors; ++i) {
-    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
-  }
-}
-
 // Use 1 pixel cache for ARGB pixels.
 #define APPLY_PALETTE_FOR(COLOR_INDEX) do {         \
   uint32_t prev_pix = palette[0];                   \
@@ -1356,16 +1603,19 @@ static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
 // using 'row' as a temporary buffer of size 'width'.
 // We assume that all src[] values have a corresponding entry in the palette.
 // Note: src[] can be the same as dst[]
-static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
-                                      uint32_t* dst, uint32_t dst_stride,
-                                      const uint32_t* palette, int palette_size,
-                                      int width, int height, int xbits) {
+static int ApplyPalette(const uint32_t* src, uint32_t src_stride, uint32_t* dst,
+                        uint32_t dst_stride, const uint32_t* palette,
+                        int palette_size, int width, int height, int xbits,
+                        const WebPPicture* const pic) {
   // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
   // made to work in-place.
   uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
   int x, y;
 
-  if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (tmp_row == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return 0;
+  }
 
   if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
     APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
@@ -1410,7 +1660,7 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
     }
   }
   WebPSafeFree(tmp_row);
-  return VP8_ENC_OK;
+  return 1;
 }
 #undef APPLY_PALETTE_FOR
 #undef PALETTE_INV_SIZE_BITS
@@ -1418,9 +1668,7 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
 #undef APPLY_PALETTE_GREEDY_MAX
 
 // Note: Expects "enc->palette_" to be set properly.
-static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
-                                             int in_place) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
@@ -1438,19 +1686,22 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
     xbits = (palette_size <= 16) ? 1 : 0;
   }
 
-  err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-  if (err != VP8_ENC_OK) return err;
-
-  err = ApplyPalette(src, src_stride,
+  if (!AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height)) {
+    return 0;
+  }
+  if (!ApplyPalette(src, src_stride,
                      enc->argb_, enc->current_width_,
-                     palette, palette_size, width, height, xbits);
+                     palette, palette_size, width, height, xbits, pic)) {
+    return 0;
+  }
   enc->argb_content_ = kEncoderPalette;
-  return err;
+  return 1;
 }
 
 // Save palette_[] to bitstream.
 static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
-                                       VP8LEncoder* const enc) {
+                                       VP8LEncoder* const enc,
+                                       int percent_range, int* const percent) {
   int i;
   uint32_t tmp_palette[MAX_PALETTE_SIZE];
   const int palette_size = enc->palette_size_;
@@ -1464,8 +1715,8 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
   }
   tmp_palette[0] = palette[0];
   return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
-                              &enc->refs_[0], &enc->refs_[1], palette_size, 1,
-                              20 /* quality */, low_effort);
+                              &enc->refs_[0], palette_size, 1, /*quality=*/20,
+                              low_effort, enc->pic_, percent_range, percent);
 }
 
 // -----------------------------------------------------------------------------
@@ -1491,7 +1742,7 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) {
   if (enc != NULL) {
     int i;
     VP8LHashChainClear(&enc->hash_chain_);
-    for (i = 0; i < 3; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
+    for (i = 0; i < 4; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
     ClearTransformBuffer(enc);
     WebPSafeFree(enc);
   }
@@ -1509,7 +1760,6 @@ typedef struct {
   CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
   int num_crunch_configs_;
   int red_and_blue_always_zero_;
-  WebPEncodingError err_;
   WebPAuxStats* stats_;
 } StreamEncodeContext;
 
@@ -1526,7 +1776,6 @@ static int EncodeStreamHook(void* input, void* data2) {
 #if !defined(WEBP_DISABLE_STATS)
   WebPAuxStats* const stats = params->stats_;
 #endif
-  WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
   const int low_effort = (config->method == 0);
 #if (WEBP_NEAR_LOSSLESS == 1)
@@ -1534,6 +1783,7 @@ static int EncodeStreamHook(void* input, void* data2) {
 #endif
   const int height = picture->height;
   const size_t byte_position = VP8LBitWriterNumBytes(bw);
+  int percent = 2;  // for WebPProgressHook
 #if (WEBP_NEAR_LOSSLESS == 1)
   int use_near_lossless = 0;
 #endif
@@ -1541,24 +1791,28 @@ static int EncodeStreamHook(void* input, void* data2) {
   int data_size = 0;
   int use_delta_palette = 0;
   int idx;
-  size_t best_size = 0;
+  size_t best_size = ~(size_t)0;
   VP8LBitWriter bw_init = *bw, bw_best;
   (void)data2;
 
   if (!VP8LBitWriterInit(&bw_best, 0) ||
       (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   for (idx = 0; idx < num_crunch_configs; ++idx) {
     const int entropy_idx = crunch_configs[idx].entropy_idx_;
-    enc->use_palette_ = (entropy_idx == kPalette);
+    int remaining_percent = 97 / num_crunch_configs, percent_range;
+    enc->use_palette_ =
+        (entropy_idx == kPalette) || (entropy_idx == kPaletteAndSpatial);
     enc->use_subtract_green_ =
         (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen);
-    enc->use_predict_ =
-        (entropy_idx == kSpatial) || (entropy_idx == kSpatialSubGreen);
-    if (low_effort) {
+    enc->use_predict_ = (entropy_idx == kSpatial) ||
+                        (entropy_idx == kSpatialSubGreen) ||
+                        (entropy_idx == kPaletteAndSpatial);
+    // When using a palette, R/B==0, hence no need to test for cross-color.
+    if (low_effort || enc->use_palette_) {
       enc->use_cross_color_ = 0;
     } else {
       enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
@@ -1573,11 +1827,10 @@ static int EncodeStreamHook(void* input, void* data2) {
     use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
                         !enc->use_predict_;
     if (use_near_lossless) {
-      err = AllocateTransformBuffer(enc, width, height);
-      if (err != VP8_ENC_OK) goto Error;
+      if (!AllocateTransformBuffer(enc, width, height)) goto Error;
       if ((enc->argb_content_ != kEncoderNearLossless) &&
           !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
-        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
         goto Error;
       }
       enc->argb_content_ = kEncoderNearLossless;
@@ -1590,10 +1843,26 @@ static int EncodeStreamHook(void* input, void* data2) {
 
     // Encode palette
     if (enc->use_palette_) {
-      err = EncodePalette(bw, low_effort, enc);
-      if (err != VP8_ENC_OK) goto Error;
-      err = MapImageFromPalette(enc, use_delta_palette);
-      if (err != VP8_ENC_OK) goto Error;
+      if (crunch_configs[idx].palette_sorting_type_ == kSortedDefault) {
+        // Nothing to do, we have already sorted the palette.
+        memcpy(enc->palette_, enc->palette_sorted_,
+               enc->palette_size_ * sizeof(*enc->palette_));
+      } else if (crunch_configs[idx].palette_sorting_type_ == kMinimizeDelta) {
+        PaletteSortMinimizeDeltas(enc->palette_sorted_, enc->palette_size_,
+                                  enc->palette_);
+      } else {
+        assert(crunch_configs[idx].palette_sorting_type_ == kModifiedZeng);
+        if (!PaletteSortModifiedZeng(enc->pic_, enc->palette_sorted_,
+                                      enc->palette_size_, enc->palette_)) {
+          goto Error;
+        }
+      }
+      percent_range = remaining_percent / 4;
+      if (!EncodePalette(bw, low_effort, enc, percent_range, &percent)) {
+        goto Error;
+      }
+      remaining_percent -= percent_range;
+      if (!MapImageFromPalette(enc, use_delta_palette)) goto Error;
       // If using a color cache, do not have it bigger than the number of
       // colors.
       if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
@@ -1604,8 +1873,7 @@ static int EncodeStreamHook(void* input, void* data2) {
       // In case image is not packed.
       if (enc->argb_content_ != kEncoderNearLossless &&
           enc->argb_content_ != kEncoderPalette) {
-        err = MakeInputImageCopy(enc);
-        if (err != VP8_ENC_OK) goto Error;
+        if (!MakeInputImageCopy(enc)) goto Error;
       }
 
       // -----------------------------------------------------------------------
@@ -1616,15 +1884,22 @@ static int EncodeStreamHook(void* input, void* data2) {
       }
 
       if (enc->use_predict_) {
-        err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
-                                 low_effort, enc->use_subtract_green_, bw);
-        if (err != VP8_ENC_OK) goto Error;
+        percent_range = remaining_percent / 3;
+        if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                                low_effort, enc->use_subtract_green_, bw,
+                                percent_range, &percent)) {
+          goto Error;
+        }
+        remaining_percent -= percent_range;
       }
 
       if (enc->use_cross_color_) {
-        err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
-                                    low_effort, bw);
-        if (err != VP8_ENC_OK) goto Error;
+        percent_range = remaining_percent / 2;
+        if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                                   low_effort, bw, percent_range, &percent)) {
+          goto Error;
+        }
+        remaining_percent -= percent_range;
       }
     }
 
@@ -1632,15 +1907,16 @@ static int EncodeStreamHook(void* input, void* data2) {
 
     // -------------------------------------------------------------------------
     // Encode and write the transformed image.
-    err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
-                              enc->current_width_, height, quality, low_effort,
-                              use_cache, &crunch_configs[idx],
-                              &enc->cache_bits_, enc->histo_bits_,
-                              byte_position, &hdr_size, &data_size);
-    if (err != VP8_ENC_OK) goto Error;
+    if (!EncodeImageInternal(
+            bw, enc->argb_, &enc->hash_chain_, enc->refs_, enc->current_width_,
+            height, quality, low_effort, use_cache, &crunch_configs[idx],
+            &enc->cache_bits_, enc->histo_bits_, byte_position, &hdr_size,
+            &data_size, picture, remaining_percent, &percent)) {
+      goto Error;
+    }
 
     // If we are better than what we already have.
-    if (idx == 0 || VP8LBitWriterNumBytes(bw) < best_size) {
+    if (VP8LBitWriterNumBytes(bw) < best_size) {
       best_size = VP8LBitWriterNumBytes(bw);
       // Store the BitWriter.
       VP8LBitWriterSwap(bw, &bw_best);
@@ -1667,18 +1943,15 @@ static int EncodeStreamHook(void* input, void* data2) {
   }
   VP8LBitWriterSwap(&bw_best, bw);
 
-Error:
+ Error:
   VP8LBitWriterWipeOut(&bw_best);
-  params->err_ = err;
   // The hook should return false in case of error.
-  return (err == VP8_ENC_OK);
+  return (params->picture_->error_code == VP8_ENC_OK);
 }
 
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw_main,
-                                   int use_cache) {
-  WebPEncodingError err = VP8_ENC_OK;
+int VP8LEncodeStream(const WebPConfig* const config,
+                     const WebPPicture* const picture,
+                     VP8LBitWriter* const bw_main, int use_cache) {
   VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
   VP8LEncoder* enc_side = NULL;
   CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
@@ -1690,15 +1963,24 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // The main thread uses picture->stats, the side thread uses stats_side.
   WebPAuxStats stats_side;
   VP8LBitWriter bw_side;
+  WebPPicture picture_side;
   const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
   int ok_main;
 
+  if (enc_main == NULL || !VP8LBitWriterInit(&bw_side, 0)) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    VP8LEncoderDelete(enc_main);
+    return 0;
+  }
+
+  // Avoid "garbage value" error from Clang's static analysis tool.
+  WebPPictureInit(&picture_side);
+
   // Analyze image (entropy, num_palettes etc)
-  if (enc_main == NULL ||
-      !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+  if (!EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
                       &red_and_blue_always_zero) ||
-      !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      !EncoderInit(enc_main)) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
@@ -1727,25 +2009,32 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       StreamEncodeContext* const param =
           (idx == 0) ? &params_main : &params_side;
       param->config_ = config;
-      param->picture_ = picture;
       param->use_cache_ = use_cache;
       param->red_and_blue_always_zero_ = red_and_blue_always_zero;
       if (idx == 0) {
+        param->picture_ = picture;
         param->stats_ = picture->stats;
         param->bw_ = bw_main;
         param->enc_ = enc_main;
       } else {
+        // Create a side picture (error_code is not thread-safe).
+        if (!WebPPictureView(picture, /*left=*/0, /*top=*/0, picture->width,
+                             picture->height, &picture_side)) {
+          assert(0);
+        }
+        picture_side.progress_hook = NULL;  // Progress hook is not thread-safe.
+        param->picture_ = &picture_side;  // No need to free a view afterwards.
         param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
         // Create a side bit writer.
         if (!VP8LBitWriterClone(bw_main, &bw_side)) {
-          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
           goto Error;
         }
         param->bw_ = &bw_side;
         // Create a side encoder.
-        enc_side = VP8LEncoderNew(config, picture);
+        enc_side = VP8LEncoderNew(config, &picture_side);
         if (enc_side == NULL || !EncoderInit(enc_side)) {
-          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
           goto Error;
         }
         // Copy the values that were computed for the main encoder.
@@ -1754,6 +2043,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
         enc_side->palette_size_ = enc_main->palette_size_;
         memcpy(enc_side->palette_, enc_main->palette_,
                sizeof(enc_main->palette_));
+        memcpy(enc_side->palette_sorted_, enc_main->palette_sorted_,
+               sizeof(enc_main->palette_sorted_));
         param->enc_ = enc_side;
       }
       // Create the workers.
@@ -1767,7 +2058,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // Start the second thread if needed.
   if (num_crunch_configs_side != 0) {
     if (!worker_interface->Reset(&worker_side)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
       goto Error;
     }
 #if !defined(WEBP_DISABLE_STATS)
@@ -1777,8 +2068,6 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       memcpy(&stats_side, picture->stats, sizeof(stats_side));
     }
 #endif
-    // This line is only useful to remove a Clang static analyzer warning.
-    params_side.err_ = VP8_ENC_OK;
     worker_interface->Launch(&worker_side);
   }
   // Execute the main thread.
@@ -1790,7 +2079,10 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     const int ok_side = worker_interface->Sync(&worker_side);
     worker_interface->End(&worker_side);
     if (!ok_main || !ok_side) {
-      err = ok_main ? params_side.err_ : params_main.err_;
+      if (picture->error_code == VP8_ENC_OK) {
+        assert(picture_side.error_code != VP8_ENC_OK);
+        WebPEncodingSetError(picture, picture_side.error_code);
+      }
       goto Error;
     }
     if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
@@ -1801,22 +2093,17 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       }
 #endif
     }
-  } else {
-    if (!ok_main) {
-      err = params_main.err_;
-      goto Error;
-    }
   }
 
-Error:
+ Error:
   VP8LBitWriterWipeOut(&bw_side);
   VP8LEncoderDelete(enc_main);
   VP8LEncoderDelete(enc_side);
-  return err;
+  return (picture->error_code == VP8_ENC_OK);
 }
 
 #undef CRUNCH_CONFIGS_MAX
-#undef CRUNCH_CONFIGS_LZ77_MAX
+#undef CRUNCH_SUBCONFIGS_MAX
 
 int VP8LEncodeImage(const WebPConfig* const config,
                     const WebPPicture* const picture) {
@@ -1825,14 +2112,12 @@ int VP8LEncodeImage(const WebPConfig* const config,
   size_t coded_size;
   int percent = 0;
   int initial_size;
-  WebPEncodingError err = VP8_ENC_OK;
   VP8LBitWriter bw;
 
   if (picture == NULL) return 0;
 
   if (config == NULL || picture->argb == NULL) {
-    err = VP8_ENC_ERROR_NULL_PARAMETER;
-    WebPEncodingSetError(picture, err);
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
     return 0;
   }
 
@@ -1843,13 +2128,13 @@ int VP8LEncodeImage(const WebPConfig* const config,
   initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
       width * height : width * height * 2;
   if (!VP8LBitWriterInit(&bw, initial_size)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   if (!WebPReportProgress(picture, 1, &percent)) {
  UserAbort:
-    err = VP8_ENC_ERROR_USER_ABORT;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_USER_ABORT);
     goto Error;
   }
   // Reset stats (for pure lossless coding)
@@ -1865,28 +2150,26 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   // Write image size.
   if (!WriteImageSize(picture, &bw)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   has_alpha = WebPPictureHasTransparency(picture);
   // Write the non-trivial Alpha flag and lossless version.
   if (!WriteRealAlphaAndVersion(&bw, has_alpha)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
-  if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort;
+  if (!WebPReportProgress(picture, 2, &percent)) goto UserAbort;
 
   // Encode main image stream.
-  err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
-  if (err != VP8_ENC_OK) goto Error;
+  if (!VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/)) goto Error;
 
-  if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
+  if (!WebPReportProgress(picture, 99, &percent)) goto UserAbort;
 
   // Finish the RIFF chunk.
-  err = WriteImage(picture, &bw, &coded_size);
-  if (err != VP8_ENC_OK) goto Error;
+  if (!WriteImage(picture, &bw, &coded_size)) goto Error;
 
   if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
 
@@ -1905,13 +2188,11 @@ int VP8LEncodeImage(const WebPConfig* const config,
   }
 
  Error:
-  if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-  VP8LBitWriterWipeOut(&bw);
-  if (err != VP8_ENC_OK) {
-    WebPEncodingSetError(picture, err);
-    return 0;
+  if (bw.error_) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  return 1;
+  VP8LBitWriterWipeOut(&bw);
+  return (picture->error_code == VP8_ENC_OK);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/vp8li_enc.h b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
index d2d0fc5..3d35e16 100644
--- a/src/3rdparty/libwebp/src/enc/vp8li_enc.h
+++ b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
@@ -69,9 +69,11 @@ typedef struct {
   int use_palette_;
   int palette_size_;
   uint32_t palette_[MAX_PALETTE_SIZE];
+  // Sorted version of palette_ for cache purposes.
+  uint32_t palette_sorted_[MAX_PALETTE_SIZE];
 
   // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
+  struct VP8LBackwardRefs refs_[4];  // Backward Refs array for temporaries.
   VP8LHashChain hash_chain_;         // HashChain data for constructing
                                      // backward references.
 } VP8LEncoder;
@@ -87,9 +89,10 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
 // Encodes the main image stream using the supplied bit writer.
 // If 'use_cache' is false, disables the use of color cache.
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw, int use_cache);
+// Returns false in case of error (stored in picture->error_code).
+int VP8LEncodeStream(const WebPConfig* const config,
+                     const WebPPicture* const picture, VP8LBitWriter* const bw,
+                     int use_cache);
 
 #if (WEBP_NEAR_LOSSLESS == 1)
 // in near_lossless.c
@@ -101,13 +104,18 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 //------------------------------------------------------------------------------
 // Image transforms in predictor.c.
 
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
-                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image, int near_lossless, int exact,
-                       int used_subtract_green);
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image);
+// pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
+int VP8LResidualImage(int width, int height, int bits, int low_effort,
+                      uint32_t* const argb, uint32_t* const argb_scratch,
+                      uint32_t* const image, int near_lossless, int exact,
+                      int used_subtract_green, const WebPPicture* const pic,
+                      int percent_range, int* const percent);
+
+int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                            uint32_t* const argb, uint32_t* image,
+                            const WebPPicture* const pic, int percent_range,
+                            int* const percent);
 
 //------------------------------------------------------------------------------
 
diff --git a/src/3rdparty/libwebp/src/enc/webp_enc.c b/src/3rdparty/libwebp/src/enc/webp_enc.c
index 9f4b10c..9620e05 100644
--- a/src/3rdparty/libwebp/src/enc/webp_enc.c
+++ b/src/3rdparty/libwebp/src/enc/webp_enc.c
@@ -336,9 +336,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   if (!WebPValidateConfig(config)) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
-  if (pic->width <= 0 || pic->height <= 0) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
-  }
+  if (!WebPValidatePicture(pic)) return 0;
   if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
   }
@@ -400,7 +398,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
     }
 
     if (!config->exact) {
-      WebPCleanupTransparentAreaLossless(pic);
+      WebPReplaceTransparentPixels(pic, 0x000000);
     }
 
     ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
diff --git a/src/3rdparty/libwebp/src/mux/anim_encode.c b/src/3rdparty/libwebp/src/mux/anim_encode.c
index 7be9906..7078d9a 100644
--- a/src/3rdparty/libwebp/src/mux/anim_encode.c
+++ b/src/3rdparty/libwebp/src/mux/anim_encode.c
@@ -248,9 +248,6 @@ WebPAnimEncoder* WebPAnimEncoderNewInternal(
 
   enc = (WebPAnimEncoder*)WebPSafeCalloc(1, sizeof(*enc));
   if (enc == NULL) return NULL;
-  // sanity inits, so we can call WebPAnimEncoderDelete():
-  enc->encoded_frames_ = NULL;
-  enc->mux_ = NULL;
   MarkNoError(enc);
 
   // Dimensions and options.
@@ -421,7 +418,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src,
   const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
   const int max_allowed_diff = is_lossless ? 0 : max_allowed_diff_lossy;
 
-  // Sanity checks.
+  // Assumption/correctness checks.
   assert(src->width == dst->width && src->height == dst->height);
   assert(rect->x_offset_ + rect->width_ <= dst->width);
   assert(rect->y_offset_ + rect->height_ <= dst->height);
@@ -949,7 +946,8 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
   int new_duration;
 
   assert(enc->count_ >= 1);
-  assert(prev_enc_frame->sub_frame_.duration ==
+  assert(!prev_enc_frame->is_key_frame_ ||
+         prev_enc_frame->sub_frame_.duration ==
          prev_enc_frame->key_frame_.duration);
   assert(prev_enc_frame->sub_frame_.duration ==
          (prev_enc_frame->sub_frame_.duration & (MAX_DURATION - 1)));
@@ -966,7 +964,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
       0x10, 0x88, 0x88, 0x08
     };
     const WebPData lossless_1x1 = {
-        lossless_1x1_bytes, sizeof(lossless_1x1_bytes)
+      lossless_1x1_bytes, sizeof(lossless_1x1_bytes)
     };
     const uint8_t lossy_1x1_bytes[] = {
       0x52, 0x49, 0x46, 0x46, 0x40, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50,
@@ -1358,6 +1356,12 @@ int WebPAnimEncoderAdd(WebPAnimEncoder* enc, WebPPicture* frame, int timestamp,
     if (!IncreasePreviousDuration(enc, (int)prev_frame_duration)) {
       return 0;
     }
+    // IncreasePreviousDuration() may add a frame to avoid exceeding
+    // MAX_DURATION which could cause CacheFrame() to over read encoded_frames_
+    // before the next flush.
+    if (enc->count_ == enc->size_ && !FlushFrames(enc)) {
+      return 0;
+    }
   } else {
     enc->first_timestamp_ = timestamp;
   }
diff --git a/src/3rdparty/libwebp/src/mux/muxedit.c b/src/3rdparty/libwebp/src/mux/muxedit.c
index ccf14b2..63e71a0 100644
--- a/src/3rdparty/libwebp/src/mux/muxedit.c
+++ b/src/3rdparty/libwebp/src/mux/muxedit.c
@@ -70,6 +70,7 @@ void WebPMuxDelete(WebPMux* mux) {
     err = ChunkAssignData(&chunk, data, copy_data, tag);                       \
     if (err == WEBP_MUX_OK) {                                                  \
       err = ChunkSetHead(&chunk, (LIST));                                      \
+      if (err != WEBP_MUX_OK) ChunkRelease(&chunk);                            \
     }                                                                          \
     return err;                                                                \
   }
@@ -235,7 +236,6 @@ WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,
   WebPMuxImage wpi;
   WebPMuxError err;
 
-  // Sanity checks.
   if (mux == NULL || bitstream == NULL || bitstream->bytes == NULL ||
       bitstream->size > MAX_CHUNK_PAYLOAD) {
     return WEBP_MUX_INVALID_ARGUMENT;
@@ -267,7 +267,6 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info,
   WebPMuxImage wpi;
   WebPMuxError err;
 
-  // Sanity checks.
   if (mux == NULL || info == NULL) return WEBP_MUX_INVALID_ARGUMENT;
 
   if (info->id != WEBP_CHUNK_ANMF) return WEBP_MUX_INVALID_ARGUMENT;
diff --git a/src/3rdparty/libwebp/src/mux/muxi.h b/src/3rdparty/libwebp/src/mux/muxi.h
index 7bc0b07..7929138 100644
--- a/src/3rdparty/libwebp/src/mux/muxi.h
+++ b/src/3rdparty/libwebp/src/mux/muxi.h
@@ -28,8 +28,8 @@ extern "C" {
 // Defines and constants.
 
 #define MUX_MAJ_VERSION 1
-#define MUX_MIN_VERSION 0
-#define MUX_REV_VERSION 3
+#define MUX_MIN_VERSION 3
+#define MUX_REV_VERSION 0
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
diff --git a/src/3rdparty/libwebp/src/mux/muxinternal.c b/src/3rdparty/libwebp/src/mux/muxinternal.c
index b9ee671..75b6b41 100644
--- a/src/3rdparty/libwebp/src/mux/muxinternal.c
+++ b/src/3rdparty/libwebp/src/mux/muxinternal.c
@@ -155,17 +155,18 @@ WebPMuxError ChunkSetHead(WebPChunk* const chunk,
 
 WebPMuxError ChunkAppend(WebPChunk* const chunk,
                          WebPChunk*** const chunk_list) {
+  WebPMuxError err;
   assert(chunk_list != NULL && *chunk_list != NULL);
 
   if (**chunk_list == NULL) {
-    ChunkSetHead(chunk, *chunk_list);
+    err = ChunkSetHead(chunk, *chunk_list);
   } else {
     WebPChunk* last_chunk = **chunk_list;
     while (last_chunk->next_ != NULL) last_chunk = last_chunk->next_;
-    ChunkSetHead(chunk, &last_chunk->next_);
-    *chunk_list = &last_chunk->next_;
+    err = ChunkSetHead(chunk, &last_chunk->next_);
+    if (err == WEBP_MUX_OK) *chunk_list = &last_chunk->next_;
   }
-  return WEBP_MUX_OK;
+  return err;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/mux/muxread.c b/src/3rdparty/libwebp/src/mux/muxread.c
index 268f6ac..8005039 100644
--- a/src/3rdparty/libwebp/src/mux/muxread.c
+++ b/src/3rdparty/libwebp/src/mux/muxread.c
@@ -56,7 +56,7 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk,
   uint32_t chunk_size;
   WebPData chunk_data;
 
-  // Sanity checks.
+  // Correctness checks.
   if (data_size < CHUNK_HEADER_SIZE) return WEBP_MUX_NOT_ENOUGH_DATA;
   chunk_size = GetLE32(data + TAG_SIZE);
   if (chunk_size > MAX_CHUNK_PAYLOAD) return WEBP_MUX_BAD_DATA;
@@ -100,7 +100,7 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
                          WebPMuxImage* const wpi) {
   const uint8_t* bytes = chunk->data_.bytes;
   size_t size = chunk->data_.size;
-  const uint8_t* const last = bytes + size;
+  const uint8_t* const last = (bytes == NULL) ? NULL : bytes + size;
   WebPChunk subchunk;
   size_t subchunk_size;
   WebPChunk** unknown_chunk_list = &wpi->unknown_;
@@ -155,7 +155,6 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
         break;
       default:
         goto Fail;
-        break;
     }
     subchunk_size = ChunkDiskSize(&subchunk);
     bytes += subchunk_size;
@@ -187,7 +186,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
   WebPChunk** chunk_list_ends[WEBP_CHUNK_NIL + 1] = { NULL };
   ChunkInit(&chunk);
 
-  // Sanity checks.
   if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_MUX_ABI_VERSION)) {
     return NULL;  // version mismatch
   }
@@ -264,7 +262,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
         if (!MuxImageParse(&chunk, copy_data, wpi)) goto Err;
         ChunkRelease(&chunk);
         goto PushImage;
-        break;
       default:  // A non-image chunk.
         if (wpi->is_partial_) goto Err;  // Encountered a non-image chunk before
                                          // getting all chunks of an image.
@@ -483,7 +480,6 @@ WebPMuxError WebPMuxGetFrame(
   WebPMuxError err;
   WebPMuxImage* wpi;
 
-  // Sanity checks.
   if (mux == NULL || frame == NULL) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
index 46b3880..24f3af7 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -55,7 +55,7 @@ void VP8LoadFinalBytes(VP8BitReader* const br);
 
 // makes sure br->value_ has at least BITS bits worth of data
 static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void VP8LoadNewBytes(VP8BitReader* const br) {
+void VP8LoadNewBytes(VP8BitReader* WEBP_RESTRICT const br) {
   assert(br != NULL && br->buf_ != NULL);
   // Read 'BITS' bits at a time if possible.
   if (br->buf_ < br->buf_max_) {
@@ -104,7 +104,7 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
 }
 
 // Read a bit with proba 'prob'. Speed-critical function!
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br,
+static WEBP_INLINE int VP8GetBit(VP8BitReader* WEBP_RESTRICT const br,
                                  int prob, const char label[]) {
   // Don't move this declaration! It makes a big speed difference to store
   // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
@@ -137,7 +137,8 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br,
 
 // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
 static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
-int VP8GetSigned(VP8BitReader* const br, int v, const char label[]) {
+int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v,
+                 const char label[]) {
   if (br->bits_ < 0) {
     VP8LoadNewBytes(br);
   }
@@ -147,15 +148,15 @@ int VP8GetSigned(VP8BitReader* const br, int v, const char label[]) {
     const range_t value = (range_t)(br->value_ >> pos);
     const int32_t mask = (int32_t)(split - value) >> 31;  // -1 or 0
     br->bits_ -= 1;
-    br->range_ += mask;
+    br->range_ += (range_t)mask;
     br->range_ |= 1;
-    br->value_ -= (bit_t)((split + 1) & mask) << pos;
+    br->value_ -= (bit_t)((split + 1) & (uint32_t)mask) << pos;
     BT_TRACK(br);
     return (v ^ mask) - mask;
   }
 }
 
-static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br,
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* WEBP_RESTRICT const br,
                                     int prob, const char label[]) {
   // Don't move this declaration! It makes a big speed difference to store
   // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer_utils.c b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
index bef0e31..2f40850 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
@@ -278,7 +278,7 @@ void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
   // If needed, make some room by flushing some bits out.
   if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
     const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
-    if (extra_size != (size_t)extra_size ||
+    if (!CheckSizeOverflow(extra_size) ||
         !VP8LBitWriterResize(bw, (size_t)extra_size)) {
       bw->cur_ = bw->buf_;
       bw->error_ = 1;
@@ -314,7 +314,7 @@ void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
     while (used >= VP8L_WRITER_BITS) {
       if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
         const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
-        if (extra_size != (size_t)extra_size ||
+        if (!CheckSizeOverflow(extra_size) ||
             !VP8LBitWriterResize(bw, (size_t)extra_size)) {
           bw->cur_ = bw->buf_;
           bw->error_ = 1;
diff --git a/src/3rdparty/libwebp/src/utils/color_cache_utils.c b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
index b09f538..7b5222b 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache_utils.c
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
@@ -20,22 +20,22 @@
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 
-int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits) {
   const int hash_size = 1 << hash_bits;
-  assert(cc != NULL);
+  assert(color_cache != NULL);
   assert(hash_bits > 0);
-  cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
-                                          sizeof(*cc->colors_));
-  if (cc->colors_ == NULL) return 0;
-  cc->hash_shift_ = 32 - hash_bits;
-  cc->hash_bits_ = hash_bits;
+  color_cache->colors_ = (uint32_t*)WebPSafeCalloc(
+      (uint64_t)hash_size, sizeof(*color_cache->colors_));
+  if (color_cache->colors_ == NULL) return 0;
+  color_cache->hash_shift_ = 32 - hash_bits;
+  color_cache->hash_bits_ = hash_bits;
   return 1;
 }
 
-void VP8LColorCacheClear(VP8LColorCache* const cc) {
-  if (cc != NULL) {
-    WebPSafeFree(cc->colors_);
-    cc->colors_ = NULL;
+void VP8LColorCacheClear(VP8LColorCache* const color_cache) {
+  if (color_cache != NULL) {
+    WebPSafeFree(color_cache->colors_);
+    color_cache->colors_ = NULL;
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/utils/color_cache_utils.h b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
index ec21d51..b45d47c 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache_utils.h
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
@@ -26,7 +26,7 @@ extern "C" {
 
 // Main color cache struct.
 typedef struct {
-  uint32_t *colors_;  // color entries
+  uint32_t* colors_;  // color entries
   int hash_shift_;    // Hash shift: 32 - hash_bits_.
   int hash_bits_;
 } VP8LColorCache;
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
index 6f3b1bb..585db91 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -161,7 +161,7 @@ static void SetBitDepths(const HuffmanTree* const tree,
 // especially when population counts are longer than 2**tree_limit, but
 // we are not planning to use this with extremely long blocks.
 //
-// See http://en.wikipedia.org/wiki/Huffman_coding
+// See https://en.wikipedia.org/wiki/Huffman_coding
 static void GenerateOptimalTree(const uint32_t* const histogram,
                                 int histogram_size,
                                 HuffmanTree* tree, int tree_depth_limit,
@@ -404,8 +404,7 @@ static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
 // Main entry point
 
 void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
-                           uint8_t* const buf_rle,
-                           HuffmanTree* const huff_tree,
+                           uint8_t* const buf_rle, HuffmanTree* const huff_tree,
                            HuffmanTreeCode* const huff_code) {
   const int num_symbols = huff_code->num_symbols;
   memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
index 3e6763c..3f7f1d8 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
@@ -51,7 +51,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
 // huffman code tree.
 void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
                            uint8_t* const buf_rle, HuffmanTree* const huff_tree,
-                           HuffmanTreeCode* const tree);
+                           HuffmanTreeCode* const huff_code);
 
 #ifdef __cplusplus
 }
diff --git a/src/3rdparty/libwebp/src/utils/huffman_utils.c b/src/3rdparty/libwebp/src/utils/huffman_utils.c
index 0cba0fb..90c2fbf 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_utils.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.c
@@ -142,7 +142,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 
   {
     int step;              // step size to replicate values in current table
-    uint32_t low = -1;     // low bits for current root entry
+    uint32_t low = 0xffffffffu;        // low bits for current root entry
     uint32_t mask = total_size - 1;    // mask for low bits
     uint32_t key = 0;      // reversed prefix code
     int num_nodes = 1;     // number of Huffman tree nodes
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
index f65b6cd..97e7893 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
 
 #define DFIX 4           // extra precision for ordered dithering
 #define DSIZE 4          // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
   {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
   { 12,  4, 14,  6 },
diff --git a/src/3rdparty/libwebp/src/utils/rescaler_utils.c b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
index 4bcae24..a0581a1 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler_utils.c
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
@@ -12,66 +12,74 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include "src/dsp/dsp.h"
 #include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst,
-                      int dst_width, int dst_height, int dst_stride,
-                      int num_channels, rescaler_t* const work) {
+int WebPRescalerInit(WebPRescaler* const rescaler,
+                     int src_width, int src_height,
+                     uint8_t* const dst,
+                     int dst_width, int dst_height, int dst_stride,
+                     int num_channels, rescaler_t* const work) {
   const int x_add = src_width, x_sub = dst_width;
   const int y_add = src_height, y_sub = dst_height;
-  wrk->x_expand = (src_width < dst_width);
-  wrk->y_expand = (src_height < dst_height);
-  wrk->src_width = src_width;
-  wrk->src_height = src_height;
-  wrk->dst_width = dst_width;
-  wrk->dst_height = dst_height;
-  wrk->src_y = 0;
-  wrk->dst_y = 0;
-  wrk->dst = dst;
-  wrk->dst_stride = dst_stride;
-  wrk->num_channels = num_channels;
+  const uint64_t total_size = 2ull * dst_width * num_channels * sizeof(*work);
+  if (!CheckSizeOverflow(total_size)) return 0;
+
+  rescaler->x_expand = (src_width < dst_width);
+  rescaler->y_expand = (src_height < dst_height);
+  rescaler->src_width = src_width;
+  rescaler->src_height = src_height;
+  rescaler->dst_width = dst_width;
+  rescaler->dst_height = dst_height;
+  rescaler->src_y = 0;
+  rescaler->dst_y = 0;
+  rescaler->dst = dst;
+  rescaler->dst_stride = dst_stride;
+  rescaler->num_channels = num_channels;
 
   // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
-  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  if (!wrk->x_expand) {  // fx_scale is not used otherwise
-    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  rescaler->x_add = rescaler->x_expand ? (x_sub - 1) : x_add;
+  rescaler->x_sub = rescaler->x_expand ? (x_add - 1) : x_sub;
+  if (!rescaler->x_expand) {  // fx_scale is not used otherwise
+    rescaler->fx_scale = WEBP_RESCALER_FRAC(1, rescaler->x_sub);
   }
   // vertical scaling parameters
-  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
-  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
-  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
-  if (!wrk->y_expand) {
+  rescaler->y_add = rescaler->y_expand ? y_add - 1 : y_add;
+  rescaler->y_sub = rescaler->y_expand ? y_sub - 1 : y_sub;
+  rescaler->y_accum = rescaler->y_expand ? rescaler->y_sub : rescaler->y_add;
+  if (!rescaler->y_expand) {
     // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
-    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
-    // wrk->x_add >= 1;
-    const uint64_t ratio =
-        (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
+    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= rescaler->y_add
+    // and rescaler->x_add >= 1;
+    const uint64_t num = (uint64_t)dst_height * WEBP_RESCALER_ONE;
+    const uint64_t den = (uint64_t)rescaler->x_add * rescaler->y_add;
+    const uint64_t ratio = num / den;
     if (ratio != (uint32_t)ratio) {
       // When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
       // current fixed-point precision. This happens when src_height ==
-      // wrk->y_add (which == src_height), and wrk->x_add == 1.
+      // rescaler->y_add (which == src_height), and rescaler->x_add == 1.
       // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
-      wrk->fxy_scale = 0;
+      rescaler->fxy_scale = 0;
     } else {
-      wrk->fxy_scale = (uint32_t)ratio;
+      rescaler->fxy_scale = (uint32_t)ratio;
     }
-    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+    rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->y_sub);
   } else {
-    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
-    // wrk->fxy_scale is unused here.
+    rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->x_add);
+    // rescaler->fxy_scale is unused here.
   }
-  wrk->irow = work;
-  wrk->frow = work + num_channels * dst_width;
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+  rescaler->irow = work;
+  rescaler->frow = work + num_channels * dst_width;
+  memset(work, 0, (size_t)total_size);
 
   WebPRescalerDspInit();
+  return 1;
 }
 
 int WebPRescalerGetScaledDimensions(int src_width, int src_height,
@@ -82,6 +90,7 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
   {
     int width = *scaled_width;
     int height = *scaled_height;
+    const int max_size = INT_MAX / 2;
 
     // if width is unspecified, scale original proportionally to height ratio.
     if (width == 0 && src_height > 0) {
@@ -94,7 +103,7 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
           (int)(((uint64_t)src_height * width + src_width - 1) / src_width);
     }
     // Check if the overall dimensions still make sense.
-    if (width <= 0 || height <= 0) {
+    if (width <= 0 || height <= 0 || width > max_size || height > max_size) {
       return 0;
     }
 
@@ -107,31 +116,34 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
 //------------------------------------------------------------------------------
 // all-in-one calls
 
-int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
-  const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+                           int max_num_lines) {
+  const int num_lines =
+      (rescaler->y_accum + rescaler->y_sub - 1) / rescaler->y_sub;
   return (num_lines > max_num_lines) ? max_num_lines : num_lines;
 }
 
-int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
-  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
-    if (wrk->y_expand) {
-      rescaler_t* const tmp = wrk->irow;
-      wrk->irow = wrk->frow;
-      wrk->frow = tmp;
+  while (total_imported < num_lines &&
+         !WebPRescalerHasPendingOutput(rescaler)) {
+    if (rescaler->y_expand) {
+      rescaler_t* const tmp = rescaler->irow;
+      rescaler->irow = rescaler->frow;
+      rescaler->frow = tmp;
     }
-    WebPRescalerImportRow(wrk, src);
-    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+    WebPRescalerImportRow(rescaler, src);
+    if (!rescaler->y_expand) {    // Accumulate the contribution of the new row.
       int x;
-      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
-        wrk->irow[x] += wrk->frow[x];
+      for (x = 0; x < rescaler->num_channels * rescaler->dst_width; ++x) {
+        rescaler->irow[x] += rescaler->frow[x];
       }
     }
-    ++wrk->src_y;
+    ++rescaler->src_y;
     src += src_stride;
     ++total_imported;
-    wrk->y_accum -= wrk->y_sub;
+    rescaler->y_accum -= rescaler->y_sub;
   }
   return total_imported;
 }
diff --git a/src/3rdparty/libwebp/src/utils/rescaler_utils.h b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
index ca41e42..ef201ef 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler_utils.h
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
@@ -47,12 +47,13 @@ struct WebPRescaler {
 };
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const rescaler,
-                      int src_width, int src_height,
-                      uint8_t* const dst,
-                      int dst_width, int dst_height, int dst_stride,
-                      int num_channels,
-                      rescaler_t* const work);
+// Returns false in case of error.
+int WebPRescalerInit(WebPRescaler* const rescaler,
+                     int src_width, int src_height,
+                     uint8_t* const dst,
+                     int dst_width, int dst_height, int dst_stride,
+                     int num_channels,
+                     rescaler_t* const work);
 
 // If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value
 // will be calculated preserving the aspect ratio, otherwise the values are
diff --git a/src/3rdparty/libwebp/src/utils/thread_utils.c b/src/3rdparty/libwebp/src/utils/thread_utils.c
index 438296b..4e470e1 100644
--- a/src/3rdparty/libwebp/src/utils/thread_utils.c
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.c
@@ -73,7 +73,7 @@ typedef struct {
 #endif
 
 static int pthread_create(pthread_t* const thread, const void* attr,
-                          unsigned int (__stdcall *start)(void*), void* arg) {
+                          unsigned int (__stdcall* start)(void*), void* arg) {
   (void)attr;
 #ifdef USE_CREATE_THREAD
   *thread = CreateThread(NULL,   /* lpThreadAttributes */
diff --git a/src/3rdparty/libwebp/src/utils/utils.c b/src/3rdparty/libwebp/src/utils/utils.c
index 44d5c14..a7c3a70 100644
--- a/src/3rdparty/libwebp/src/utils/utils.c
+++ b/src/3rdparty/libwebp/src/utils/utils.c
@@ -23,7 +23,7 @@
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
 // and not multi-thread safe!).
 // An interesting alternative is valgrind's 'massif' tool:
-//    http://valgrind.org/docs/manual/ms-manual.html
+//    https://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
                --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
@@ -101,6 +101,9 @@ static void Increment(int* const v) {
 #if defined(MALLOC_LIMIT)
     {
       const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+#if MALLOC_LIMIT > 1
+      mem_limit = (size_t)MALLOC_LIMIT;
+#endif
       if (malloc_limit_str != NULL) {
         mem_limit = atoi(malloc_limit_str);
       }
@@ -169,16 +172,16 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
   const uint64_t total_size = nmemb * size;
   if (nmemb == 0) return 1;
   if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
-  if (total_size != (size_t)total_size) return 0;
+  if (!CheckSizeOverflow(total_size)) return 0;
 #if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
   if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
     return 0;    // fake fail!
   }
 #endif
-#if defined(MALLOC_LIMIT)
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_LIMIT)
   if (mem_limit > 0) {
     const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
-    if (new_total_mem != (size_t)new_total_mem ||
+    if (!CheckSizeOverflow(new_total_mem) ||
         new_total_mem > mem_limit) {
       return 0;   // fake fail!
     }
@@ -216,9 +219,14 @@ void WebPSafeFree(void* const ptr) {
   free(ptr);
 }
 
-// Public API function.
+// Public API functions.
+
+void* WebPMalloc(size_t size) {
+  return WebPSafeMalloc(1, size);
+}
+
 void WebPFree(void* ptr) {
-  free(ptr);
+  WebPSafeFree(ptr);
 }
 
 //------------------------------------------------------------------------------
@@ -226,7 +234,7 @@ void WebPFree(void* ptr) {
 void WebPCopyPlane(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width, int height) {
   assert(src != NULL && dst != NULL);
-  assert(src_stride >= width && dst_stride >= width);
+  assert(abs(src_stride) >= width && abs(dst_stride) >= width);
   while (height-- > 0) {
     memcpy(dst, src, width);
     src += src_stride;
diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h
index 2a3ec92..c5ee873 100644
--- a/src/3rdparty/libwebp/src/utils/utils.h
+++ b/src/3rdparty/libwebp/src/utils/utils.h
@@ -42,6 +42,10 @@ extern "C" {
 #endif
 #endif  // WEBP_MAX_ALLOCABLE_MEMORY
 
+static WEBP_INLINE int CheckSizeOverflow(uint64_t size) {
+  return size == (size_t)size;
+}
+
 // size-checking safe malloc/calloc: verify that the requested size is not too
 // large, or return NULL. You don't need to call these for constructs like
 // malloc(sizeof(foo)), but only if there's picture-dependent size involved
@@ -60,7 +64,8 @@ WEBP_EXTERN void WebPSafeFree(void* const ptr);
 // Alignment
 
 #define WEBP_ALIGN_CST 31
-#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
+#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & \
+                         ~(uintptr_t)WEBP_ALIGN_CST)
 
 #include <string.h>
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
@@ -69,10 +74,19 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
   memcpy(&A, ptr, sizeof(A));
   return A;
 }
+
+static WEBP_INLINE int32_t WebPMemToInt32(const uint8_t* const ptr) {
+  return (int32_t)WebPMemToUint32(ptr);
+}
+
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
   memcpy(ptr, &val, sizeof(val));
 }
 
+static WEBP_INLINE void WebPInt32ToMem(uint8_t* const ptr, int val) {
+  WebPUint32ToMem(ptr, (uint32_t)val);
+}
+
 //------------------------------------------------------------------------------
 // Reading/writing data.
 
@@ -107,24 +121,33 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
-// Returns (int)floor(log2(n)). n must be > 0.
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Returns (int)floor(log2(n)). n must be > 0.
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
   return 31 ^ __builtin_clz(n);
 }
+// counts the number of trailing zero
+static WEBP_INLINE int BitsCtz(uint32_t n) { return __builtin_ctz(n); }
 #elif defined(_MSC_VER) && _MSC_VER > 1310 && \
       (defined(_M_X64) || defined(_M_IX86))
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
+  unsigned long first_set_bit;  // NOLINT (runtime/int)
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#else   // default: use the C-version.
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+  unsigned long first_set_bit;  // NOLINT (runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+#else   // default: use the (slow) C-version.
+#define WEBP_HAVE_SLOW_CLZ_CTZ   // signal that the Clz/Ctz function are slow
 // Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
 // based on table or not. Can be used as fallback if clz() is not available.
 #define WEBP_NEED_LOG_TABLE_8BIT
@@ -139,6 +162,15 @@ static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
 }
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
+
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+  int i;
+  for (i = 0; i < 32; ++i, n >>= 1) {
+    if (n & 1) return i;
+  }
+  return 32;
+}
+
 #endif
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/webp/config.h b/src/3rdparty/libwebp/src/webp/config.h
index 80bb1f0..7b1a617 100644
--- a/src/3rdparty/libwebp/src/webp/config.h
+++ b/src/3rdparty/libwebp/src/webp/config.h
@@ -29,9 +29,6 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 /* #undef HAVE_INTTYPES_H */
 
-/* Define to 1 if you have the <memory.h> header file. */
-/* #undef HAVE_MEMORY_H */
-
 /* Define to 1 if you have the <OpenGL/glut.h> header file. */
 /* #undef HAVE_OPENGL_GLUT_H */
 
@@ -44,6 +41,9 @@
 /* Define to 1 if you have the <stdint.h> header file. */
 /* #undef HAVE_STDINT_H */
 
+/* Define to 1 if you have the <stdio.h> header file. */
+/* #undef HAVE_STDIO_H */
+
 /* Define to 1 if you have the <stdlib.h> header file. */
 /* #undef HAVE_STDLIB_H */
 
@@ -81,7 +81,7 @@
 #define PACKAGE_NAME "libwebp"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 1.0.3"
+#define PACKAGE_STRING "libwebp 1.2.4"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "libwebp"
@@ -90,17 +90,19 @@
 #define PACKAGE_URL "http://developers.google.com/speed/webp"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.0.3"
+#define PACKAGE_VERSION "1.2.4"
 
 /* Define to necessary symbol if this constant uses a non-standard name on
    your system. */
 /* #undef PTHREAD_CREATE_JOINABLE */
 
-/* Define to 1 if you have the ANSI C header files. */
+/* Define to 1 if all of the C90 standard headers exist (not just the ones
+   required in a freestanding environment). This macro is provided for
+   backward compatibility; new code need not use it. */
 /* #undef STDC_HEADERS */
 
 /* Version number of package */
-#define VERSION "1.0.3"
+#define VERSION "1.2.4"
 
 /* Enable experimental code */
 /* #undef WEBP_EXPERIMENTAL_FEATURES */
diff --git a/src/3rdparty/libwebp/src/webp/decode.h b/src/3rdparty/libwebp/src/webp/decode.h
index ae8bfe8..d982475 100644
--- a/src/3rdparty/libwebp/src/webp/decode.h
+++ b/src/3rdparty/libwebp/src/webp/decode.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0208    // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0209    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -85,15 +85,12 @@ WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
 WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                                    int* width, int* height,
                                    uint8_t** u, uint8_t** v,
                                    int* stride, int* uv_stride);
 
-// Releases memory returned by the WebPDecode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
 // These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
 // available in this buffer is indicated by 'output_buffer_size'. If this
@@ -456,7 +453,7 @@ struct WebPDecoderOptions {
   int scaled_width, scaled_height;    // final resolution
   int use_threads;                    // if true, use multi-threaded decoding
   int dithering_strength;             // dithering strength (0=Off, 100=full)
-  int flip;                           // flip output vertically
+  int flip;                           // if true, flip output vertically
   int alpha_dithering_strength;       // alpha dithering strength in [0..100]
 
   uint32_t pad[5];                    // padding for later use
diff --git a/src/3rdparty/libwebp/src/webp/encode.h b/src/3rdparty/libwebp/src/webp/encode.h
index 339f881..56b68e2 100644
--- a/src/3rdparty/libwebp/src/webp/encode.h
+++ b/src/3rdparty/libwebp/src/webp/encode.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x020e    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x020f    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -79,9 +79,6 @@ WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra,
                                           int width, int height, int stride,
                                           uint8_t** output);
 
-// Releases memory returned by the WebPEncode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
 //------------------------------------------------------------------------------
 // Coding parameters
 
@@ -151,7 +148,8 @@ struct WebPConfig {
   int use_delta_palette;  // reserved for future lossless feature
   int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
 
-  uint32_t pad[2];        // padding for later use
+  int qmin;               // minimum permissible quality factor
+  int qmax;               // maximum permissible quality factor
 };
 
 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -294,6 +292,11 @@ typedef enum WebPEncodingError {
 #define WEBP_MAX_DIMENSION 16383
 
 // Main exchange structure (input samples, output bytes, statistics)
+//
+// Once WebPPictureInit() has been called, it's ok to make all the INPUT fields
+// (use_argb, y/u/v, argb, ...) point to user-owned data, even if
+// WebPPictureAlloc() has been called. Depending on the value use_argb,
+// it's guaranteed that either *argb or *y/*u/*v content will be kept untouched.
 struct WebPPicture {
   //   INPUT
   //////////////
@@ -306,7 +309,7 @@ struct WebPPicture {
   // YUV input (mostly used for input to lossy compression)
   WebPEncCSP colorspace;     // colorspace: should be YUV420 for now (=Y'CbCr).
   int width, height;         // dimensions (less or equal to WEBP_MAX_DIMENSION)
-  uint8_t *y, *u, *v;        // pointers to luma/chroma planes.
+  uint8_t* y, *u, *v;        // pointers to luma/chroma planes.
   int y_stride, uv_stride;   // luma/chroma strides.
   uint8_t* a;                // pointer to the alpha plane
   int a_stride;              // stride of the alpha plane
@@ -350,7 +353,7 @@ struct WebPPicture {
   uint32_t pad3[3];       // padding for later use
 
   // Unused for now
-  uint8_t *pad4, *pad5;
+  uint8_t* pad4, *pad5;
   uint32_t pad6[8];       // padding for later use
 
   // PRIVATE FIELDS
@@ -438,7 +441,7 @@ WEBP_EXTERN int WebPPictureCrop(WebPPicture* picture,
 // the original dimension will be lost). Picture 'dst' need not be initialized
 // with WebPPictureInit() if it is different from 'src', since its content will
 // be overwritten.
-// Returns false in case of memory allocation error or invalid parameters.
+// Returns false in case of invalid parameters.
 WEBP_EXTERN int WebPPictureView(const WebPPicture* src,
                                 int left, int top, int width, int height,
                                 WebPPicture* dst);
@@ -452,7 +455,7 @@ WEBP_EXTERN int WebPPictureIsView(const WebPPicture* picture);
 // dimension will be calculated preserving the aspect ratio.
 // No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
-WEBP_EXTERN int WebPPictureRescale(WebPPicture* pic, int width, int height);
+WEBP_EXTERN int WebPPictureRescale(WebPPicture* picture, int width, int height);
 
 // Colorspace conversion function to import RGB samples.
 // Previous buffer will be free'd, if any.
@@ -523,7 +526,7 @@ WEBP_EXTERN int WebPPictureHasTransparency(const WebPPicture* picture);
 // Remove the transparency information (if present) by blending the color with
 // the background color 'background_rgb' (specified as 24bit RGB triplet).
 // After this call, all alpha values are reset to 0xff.
-WEBP_EXTERN void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
+WEBP_EXTERN void WebPBlendAlpha(WebPPicture* picture, uint32_t background_rgb);
 
 //------------------------------------------------------------------------------
 // Main call
diff --git a/src/3rdparty/libwebp/src/webp/format_constants.h b/src/3rdparty/libwebp/src/webp/format_constants.h
index eca6981..999035c 100644
--- a/src/3rdparty/libwebp/src/webp/format_constants.h
+++ b/src/3rdparty/libwebp/src/webp/format_constants.h
@@ -55,7 +55,7 @@
 typedef enum {
   PREDICTOR_TRANSFORM      = 0,
   CROSS_COLOR_TRANSFORM    = 1,
-  SUBTRACT_GREEN           = 2,
+  SUBTRACT_GREEN_TRANSFORM = 2,
   COLOR_INDEXING_TRANSFORM = 3
 } VP8LImageTransformType;
 
diff --git a/src/3rdparty/libwebp/src/webp/mux.h b/src/3rdparty/libwebp/src/webp/mux.h
index 66096a9..7d27489 100644
--- a/src/3rdparty/libwebp/src/webp/mux.h
+++ b/src/3rdparty/libwebp/src/webp/mux.h
@@ -57,7 +57,7 @@ extern "C" {
   WebPMuxGetChunk(mux, "ICCP", &icc_profile);
   // ... (Consume icc_data).
   WebPMuxDelete(mux);
-  free(data);
+  WebPFree(data);
 */
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
@@ -245,7 +245,7 @@ WEBP_EXTERN WebPMuxError WebPMuxPushFrame(
     WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
 
 // Gets the nth frame from the mux object.
-// The content of 'frame->bitstream' is allocated using malloc(), and NOT
+// The content of 'frame->bitstream' is allocated using WebPMalloc(), and NOT
 // owned by the 'mux' object. It MUST be deallocated by the caller by calling
 // WebPDataClear().
 // nth=0 has a special meaning - last position.
@@ -376,10 +376,10 @@ WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
 // Assembles all chunks in WebP RIFF format and returns in 'assembled_data'.
 // This function also validates the mux object.
 // Note: The content of 'assembled_data' will be ignored and overwritten.
-// Also, the content of 'assembled_data' is allocated using malloc(), and NOT
-// owned by the 'mux' object. It MUST be deallocated by the caller by calling
-// WebPDataClear(). It's always safe to call WebPDataClear() upon return,
-// even in case of error.
+// Also, the content of 'assembled_data' is allocated using WebPMalloc(), and
+// NOT owned by the 'mux' object. It MUST be deallocated by the caller by
+// calling WebPDataClear(). It's always safe to call WebPDataClear() upon
+// return, even in case of error.
 // Parameters:
 //   mux - (in/out) object whose chunks are to be assembled
 //   assembled_data - (out) assembled WebP data
diff --git a/src/3rdparty/libwebp/src/webp/mux_types.h b/src/3rdparty/libwebp/src/webp/mux_types.h
index ceea77d..2fe8195 100644
--- a/src/3rdparty/libwebp/src/webp/mux_types.h
+++ b/src/3rdparty/libwebp/src/webp/mux_types.h
@@ -14,7 +14,6 @@
 #ifndef WEBP_WEBP_MUX_TYPES_H_
 #define WEBP_WEBP_MUX_TYPES_H_
 
-#include <stdlib.h>  // free()
 #include <string.h>  // memset()
 #include "./types.h"
 
@@ -56,6 +55,7 @@ typedef enum WebPMuxAnimBlend {
 
 // Data type used to describe 'raw' data, e.g., chunk data
 // (ICC profile, metadata) and WebP compressed image data.
+// 'bytes' memory must be allocated using WebPMalloc() and such.
 struct WebPData {
   const uint8_t* bytes;
   size_t size;
@@ -68,11 +68,11 @@ static WEBP_INLINE void WebPDataInit(WebPData* webp_data) {
   }
 }
 
-// Clears the contents of the 'webp_data' object by calling free(). Does not
-// deallocate the object itself.
+// Clears the contents of the 'webp_data' object by calling WebPFree().
+// Does not deallocate the object itself.
 static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
   if (webp_data != NULL) {
-    free((void*)webp_data->bytes);
+    WebPFree((void*)webp_data->bytes);
     WebPDataInit(webp_data);
   }
 }
@@ -83,7 +83,7 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
   if (src == NULL || dst == NULL) return 0;
   WebPDataInit(dst);
   if (src->bytes != NULL && src->size != 0) {
-    dst->bytes = (uint8_t*)malloc(src->size);
+    dst->bytes = (uint8_t*)WebPMalloc(src->size);
     if (dst->bytes == NULL) return 0;
     memcpy((void*)dst->bytes, src->bytes, src->size);
     dst->size = src->size;
diff --git a/src/3rdparty/libwebp/src/webp/types.h b/src/3rdparty/libwebp/src/webp/types.h
index 0ce2622..f255432 100644
--- a/src/3rdparty/libwebp/src/webp/types.h
+++ b/src/3rdparty/libwebp/src/webp/types.h
@@ -7,7 +7,7 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Common types
+//  Common types + memory wrappers
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
@@ -42,11 +42,31 @@ typedef long long int int64_t;
 # if defined(__GNUC__) && __GNUC__ >= 4
 #  define WEBP_EXTERN extern __attribute__ ((visibility ("default")))
 # else
-#  define WEBP_EXTERN extern
+#  if defined(_MSC_VER) && defined(WEBP_DLL)
+#   define WEBP_EXTERN __declspec(dllexport)
+#  else
+#   define WEBP_EXTERN extern
+#  endif
 # endif  /* __GNUC__ >= 4 */
 #endif  /* WEBP_EXTERN */
 
 // Macro to check ABI compatibility (same major revision number)
 #define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Allocates 'size' bytes of memory. Returns NULL upon error. Memory
+// must be deallocated by calling WebPFree(). This function is made available
+// by the core 'libwebp' library.
+WEBP_EXTERN void* WebPMalloc(size_t size);
+
+// Releases memory returned by the WebPDecode*() functions (from decode.h).
+WEBP_EXTERN void WebPFree(void* ptr);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
 #endif  // WEBP_WEBP_TYPES_H_