diff options
author | Eirik Aavitsland <eirik.aavitsland@qt.io> | 2019-11-21 13:50:25 +0100 |
---|---|---|
committer | Eirik Aavitsland <eirik.aavitsland@qt.io> | 2019-11-29 09:57:36 +0100 |
commit | 218e5a342b289edeac82dbebf9ab62f893964dc2 (patch) | |
tree | 4f37d3c2b3866e18700765316805f0fd2ede7960 | |
parent | 35382896a4c8f155d45a02ecfc5479b5d6e0db47 (diff) |
Update bundled libwebp to version 1.0.3
[ChangeLog][Third-Party Code] Update bundled libwebp to version 1.0.3
(This is a squashed cherry pick from 5.12 branch)
Change-Id: I177a6818c5e073298fc565f91d4126af628b7639
Reviewed-by: Andy Nichols <andy.nichols@qt.io>
Reviewed-by: Liang Qi <liang.qi@qt.io>
170 files changed, 10382 insertions, 6999 deletions
diff --git a/src/3rdparty/libwebp.pri b/src/3rdparty/libwebp.pri index 55d7e60..3e839a2 100644 --- a/src/3rdparty/libwebp.pri +++ b/src/3rdparty/libwebp.pri @@ -25,12 +25,10 @@ SOURCES += \ $$PWD/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c \ $$PWD/libwebp/src/dsp/alpha_processing_sse2.c \ $$PWD/libwebp/src/dsp/alpha_processing_sse41.c \ - $$PWD/libwebp/src/dsp/argb.c \ - $$PWD/libwebp/src/dsp/argb_mips_dsp_r2.c \ - $$PWD/libwebp/src/dsp/argb_sse2.c \ $$PWD/libwebp/src/dsp/cost.c \ $$PWD/libwebp/src/dsp/cost_mips32.c \ $$PWD/libwebp/src/dsp/cost_mips_dsp_r2.c \ + $$PWD/libwebp/src/dsp/cost_neon.c \ $$PWD/libwebp/src/dsp/cost_sse2.c \ $$PWD/libwebp/src/dsp/cpu.c \ $$PWD/libwebp/src/dsp/dec.c \ @@ -41,7 +39,6 @@ SOURCES += \ $$PWD/libwebp/src/dsp/dec_sse2.c \ $$PWD/libwebp/src/dsp/dec_sse41.c \ $$PWD/libwebp/src/dsp/enc.c \ - $$PWD/libwebp/src/dsp/enc_avx2.c \ $$PWD/libwebp/src/dsp/enc_mips32.c \ $$PWD/libwebp/src/dsp/enc_mips_dsp_r2.c \ $$PWD/libwebp/src/dsp/enc_msa.c \ @@ -64,21 +61,25 @@ SOURCES += \ $$PWD/libwebp/src/dsp/rescaler_mips_dsp_r2.c \ $$PWD/libwebp/src/dsp/rescaler_msa.c \ $$PWD/libwebp/src/dsp/rescaler_sse2.c \ + $$PWD/libwebp/src/dsp/ssim.c \ + $$PWD/libwebp/src/dsp/ssim_sse2.c \ $$PWD/libwebp/src/dsp/upsampling.c \ $$PWD/libwebp/src/dsp/upsampling_mips_dsp_r2.c \ $$PWD/libwebp/src/dsp/upsampling_msa.c \ $$PWD/libwebp/src/dsp/upsampling_sse2.c \ + $$PWD/libwebp/src/dsp/upsampling_sse41.c \ $$PWD/libwebp/src/dsp/yuv.c \ $$PWD/libwebp/src/dsp/yuv_mips_dsp_r2.c \ $$PWD/libwebp/src/dsp/lossless_sse2.c \ $$PWD/libwebp/src/dsp/yuv_mips32.c \ $$PWD/libwebp/src/dsp/yuv_sse2.c \ + $$PWD/libwebp/src/dsp/yuv_sse41.c \ $$PWD/libwebp/src/enc/alpha_enc.c \ $$PWD/libwebp/src/enc/analysis_enc.c \ + $$PWD/libwebp/src/enc/backward_references_cost_enc.c \ $$PWD/libwebp/src/enc/backward_references_enc.c \ $$PWD/libwebp/src/enc/config_enc.c \ $$PWD/libwebp/src/enc/cost_enc.c \ - $$PWD/libwebp/src/enc/delta_palettization_enc.c \ $$PWD/libwebp/src/enc/filter_enc.c \ $$PWD/libwebp/src/enc/frame_enc.c \ $$PWD/libwebp/src/enc/histogram_enc.c \ @@ -131,7 +132,8 @@ equals(QT_ARCH, arm)|equals(QT_ARCH, arm64) { $$PWD/libwebp/src/dsp/lossless_enc_neon.c \ $$PWD/libwebp/src/dsp/lossless_neon.c \ $$PWD/libwebp/src/dsp/rescaler_neon.c \ - $$PWD/libwebp/src/dsp/upsampling_neon.c + $$PWD/libwebp/src/dsp/upsampling_neon.c \ + $$PWD/libwebp/src/dsp/yuv_neon.c contains(QT_CPU_FEATURES.$$QT_ARCH, neon) { # Default compiler settings include this feature, so just add to SOURCES diff --git a/src/3rdparty/libwebp/AUTHORS b/src/3rdparty/libwebp/AUTHORS index b6e9cfb..67482c1 100644 --- a/src/3rdparty/libwebp/AUTHORS +++ b/src/3rdparty/libwebp/AUTHORS @@ -1,4 +1,6 @@ Contributors: +- Aidan O'Loan (aidanol at gmail dot com) +- Alan Browning (browning at google dot com) - Charles Munger (clm at google dot com) - Christian Duvivier (cduvivier at google dot com) - Djordje Pesut (djordje dot pesut at imgtec dot com) @@ -6,9 +8,10 @@ Contributors: - James Zern (jzern at google dot com) - Jan Engelhardt (jengelh at medozas dot de) - Jehan (jehan at girinstud dot io) -- Johann (johann dot koenig at duck dot com) +- Johann Koenig (johann dot koenig at duck dot com) - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com) - Jyrki Alakuijala (jyrki at google dot com) +- Konstantin Ivlev (tomskside at gmail dot com) - Lode Vandevenne (lode at google dot com) - Lou Quillio (louquillio at google dot com) - Mans Rullgard (mans at mansr dot com) @@ -35,4 +38,6 @@ Contributors: - Urvang Joshi (urvang at google dot com) - Vikas Arora (vikasa at google dot com) - Vincent Rabaud (vrabaud at google dot com) +- Vlad Tsyrklevich (vtsyrklevich at chromium dot org) - Yang Zhang (yang dot zhang at arm dot com) +- Yannis Guyon (yguyon at google dot com) diff --git a/src/3rdparty/libwebp/ChangeLog b/src/3rdparty/libwebp/ChangeLog index 7ac7b5f..10ba580 100644 --- a/src/3rdparty/libwebp/ChangeLog +++ b/src/3rdparty/libwebp/ChangeLog @@ -1,9 +1,551 @@ +2ad0916d update NEWS +1287362b bump version to 1.0.3 +7b968cc2 update AUTHORS +9d6988f4 Fix the oscillating prediction problem at low quality +312f74d0 makefile.unix: allow *_LIBS to be overridden w/EXTRA_LIBS +92dbf237 filters_sse2,cosmetics: shorten some long lines +a277d197 filters_sse2.c: quiet integer sanitizer warnings +804540f1 Fix cpufeatures in CMake. +bf00c15b Add CMake option for bittrace. +a788b498 filters_sse2.c: quiet integer sanitizer warnings +e6a92c5e filters.c: quiet integer sanitizer warnings +ec1cc40a lossless.c: remove U32 -> S8 conversion warnings +1106478f remove conversion U32 -> S8 warnings +812a6b49 lossless_enc: fix some conversion warning +4627c1c9 lossless_enc,TransformColorBlue: quiet uint32_t conv warning +c84673a6 lossless_enc_sse{2,41}: quiet signed conv warnings +776a7757 dec_sse2: quiet signed conv warnings +bd39c063 Merge "thread_utils: release mutex before signaling" +0550576f Merge "(alpha_processing,enc}_sse2: quiet signed conv warnings" +6682f2c4 thread_utils: release mutex before signaling +e78dea75 (alpha_processing,enc}_sse2: quiet signed conv warnings +9acf18ba iosbuild.sh: add WebP{Demux,Mux}.framework +b9be7e65 vwebp: remove the -fit option (and make it default) +1394a2bb Merge "README.webp_js: update Emscripten.cmake note" +dd3e7f8a README.webp_js: update Emscripten.cmake note +32cf8801 predictor_enc,GetBestGreenRedToBlue: quiet implicit conv warnings +e1c8acb5 Merge "vwebp: add a -fit option" +cbd23dd5 vwebp: add a -fit option +2e672351 bit_writer_utils,Flush: quiet implicit conversion warnings +1326988d swig: update libwebp_python_wrap.c +0e7f8548 update generated swig files +17ed1438 Merge "PutLE{16,24}: quiet implicit conversion warnings" +24686538 PutLE{16,24}: quiet implicit conversion warnings +153bb3a0 fix some clang-7 warnings: +ab2dc893 Rescaler: fix rounding error +aa65f89a HistogramCombineStochastic: fix free of uninit value +af0bac64 Merge "encode.h: mention 'exact' default in WebPEncodeLossless*" +6d2e11ec encode.h: mention 'exact' default in WebPEncodeLossless* +8c3f04fe AndroidCPUInfo: reorder terms in conditional +fcfd9c71 BitTrace: if BITTRACE is > 0, record and print syntax bits used +067031ea Speedups for unused Huffman groups. +01ac46ba libwebp: Display "libjpeg error:" in imageio/jpegdec +d9a662e1 WebPRescalerGetScaledDimensions: round scaled dimension up +62eb3f08 libwebp: Fix missing '{' in README +e05f785a Merge "unicode,INIT_WARGV: add missing cast" +63c9a69f tag the VP8LHashPix() function for potential uint roll-over +2b7214ab unicode,INIT_WARGV: add missing cast +bf424b46 tag the GetPixPairHash64() function for potential uint roll-over +7d05d6ca Have the color cache computation be u32-bit only. +6bcf8769 Remove BINARYEN_METHOD in wasm settings. +2b98df90 update ChangeLog (tag: v1.0.2-rc1, tag: v1.0.2) +61e372b7 update NEWS +7ae658a0 bump version to 1.0.2 +51c4907d update AUTHORS +666bd6c6 man/cwebp.1: refine near-lossless text +561cdce5 Clarify the doc about GetFeatures. +aec2cf02 near_lossless: fix fuzzing-detected integer overflow +928a75de webp: Fix VP8LBitWriterClone() bug +5173d4ee neon IsFlat +5b081219 IsFlat: inline when possible +381b7b54 IsFlat: use int for thresh +6ed15ea1 fix unprobable leak in webp_sdl.c +22bbb24e Merge "IsFlat: return int" +8b3fb238 Merge tag 'v1.0.1' +f435de95 IsFlat: return int +41521aed utils.h: only define WEBP_NEED_LOG_TABLE_8BIT when needed +9f4d4a3f neon: GetResidualCost +0fd7514b neon: SetResidualCoeffs +f95a996c Simpler histogram clustering. +e85d3313 update ChangeLog (tag: v1.0.1-rc2, tag: v1.0.1) +fa8210e4 Fix pair update in stochastic entropy merging. +fd198f73 add codereview.settings +825389ac README.mux: add a reference to the AnimDecoder API +3be698c3 CMake: fix webp_js compilation +485ff86f Fix pair update in stochastic entropy merging. +4cd0582d CMake: fix webp_js compilation +4cbb4caf update NEWS +f5a5918d bump version to 1.0.1 +d61385db Speed-up: Make sure we only initialize histograms when needed. +6752904b Speed-up: Make sure we only initialize histograms when needed. +0c570316 update AUTHORS +301a2dda img2webp: add help note about arguments from a file +f0abab92 Speedups for empty histograms. +f2dfd925 Split HistogramAdd to only have the high level logic in C. +06b7bc7d Fix compilation on windows and clang-cl+ninja. +b6284d82 img2webp: add help note about arguments from a file +decf6f6b Speedups for empty histograms. +dea3e899 Split HistogramAdd to only have the high level logic in C. +632798ae Merge "Fix compilation on windows and clang-cl+ninja." +dc1a9518 Merge "libwebp: Unicode command tools on Windows" +9cf9841b libwebp: Unicode command tools on Windows +98179495 remove some minor TODOs +a376e7b9 Fix compilation on windows and clang-cl+ninja. +cbf82cc0 Remove AVX2 files. +5030e902 Merge "TIFF decoder: remove unused KINV definition" +ac543311 Remove a few more useless #defines +123d3306 TIFF decoder: remove unused KINV definition +ef1094b0 Merge "- install pkg-config files during the CMake build" +b911fbc9 libwebp: Remove duplicate GIFDisplayError in anim_util +eee00b66 - install pkg-config files during the CMake build +ac3ec8c9 Merge "Clean-up the common sources in dsp." +3e13da7b Clean-up the common sources in dsp. +5c395f1d libwebp: cmake-format all +e7a69729 libwebp: Add extras targets in CMakeLists.txt +e52485d6 libwebp: Rename macros in webpmux.c +92dc0f09 clean-up MakeInputImageCopy() +39952de2 VP8IteratorImport: add missing 'const' +382af7a2 clean-up WebPBlendAlpha +14d020f6 libwebp: Use ExUtilGet*() in anim_diff +0d92ff25 libwebp: remove useless variable in gif2webp +556cb1b4 Merge "CMake: Set WEBP_BUILD_GIF2WEBP to off" +da26ee49 CMake: Set WEBP_BUILD_GIF2WEBP to off +b2a867c0 cwebp: Don't premultiply during -resize if -exact +637141bc pngdec: fix build w/libpng < 1.4.x +bc5092b1 pngdec: set memory functions +50d8345a Fix CMake math library. +6aa3e8aa Fix math library on Visual Studio. +d71df4e2 Fix math library finding in CMake. +de08d727 cosmetics: normalize include guard comment +009562b4 vwebp: Fix bug when Dispose then NoBlend frames +423f2579 Fix up CMake to create targets. +907208f9 Wait for all threads to be done in DecodeRemaining. +4649b3c4 vwebp: Add background color display option +78ad57a3 Fix bad glClearColor parameters +da96d8d9 Allow for a non-initialized alpha decompressor in DoRemap. +2563db47 fix rescaling rounding inaccuracy +211f37ee fix endian problems in pattern copy +5f0f5c07 Make sure partition #0 is read before VP8 data in IDecode. +de98732b fix GetColorf() bug +4338cd36 misc fixes in libwebpmux +e00af13e fix signatures after a9ceda7ff1 +a9ceda7f Speed-up chunk list operations. +2281bbf6 Merge "Better handling of bogus Huffman codes." +39cb9aad Better handling of bogus Huffman codes. +89cc9d37 Merge "fix read-overflow while parsing VP8X chunk" +95fd6507 fix read-overflow while parsing VP8X chunk +9e729fe1 Fix VP8IoTeardownHook being called twice on worker sync failure +29fb8562 Merge "muxread,anmf: fail on multiple image chunks" +eb82ce76 muxread,anmf: fail on multiple image chunks +1344a2e9 fix alpha-filtering crash when image width is larger than radius +be738c6d muxread,ChunkVerifyAndAssign: validate chunk_size +2c70ad76 muxread,CreateInternal: fix riff size checks +569001f1 Fix for thread race heap-use-after-free +c56a02d9 Android.mk: use LOCAL_EXPORT_C_INCLUDES w/public libs +15795596 CMakeLists.txt,cosmetics: normalize if() formatting +1a44c233 Merge "cmake: add support for webpmux" +e9569ad7 Merge "configure,*am,cosmetics: s/WANT_/BUILD_/" +35c7de6f cmake: add support for webpmux +0f25e61c WebpToSDL(): fix the return value in case of error +5d8985de configure,*am,cosmetics: s/WANT_/BUILD_/ +895fd28f Merge "man/Makefile.am: add img2webp.1" +5cf3e2af man/Makefile.am: add img2webp.1 +2a9de5b9 Add build rules for anim_diff & anim_dump utils. +71ed73cf fix invalid check for buffer size +af0e4fbb gif2webp: fix transcode of loop count=65535 +dce5d764 Limit memory allocation when reading invalid Huffman codes. +f9df0081 Merge "cmake: quiet glut deprecation warnings on OS X" +dc39b16f webpmux.1: correct grammar +c7aa1264 cwebp.c: fix a missing \n +53aa51e9 Merge tag 'v1.0.0' +698b8844 update ChangeLog (tag: v1.0.0) +8d510751 webp-container-spec: correct frame duration=0 note +e6b2164e vwebp: Copy Chrome's behavior w/frame duration == 0 +094b3b28 cmake: quiet glut deprecation warnings on OS X +71c39a06 webp-container-spec: correct frame duration=0 note +fd3d5756 vwebp: Copy Chrome's behavior w/frame duration == 0 +b0c966fb Build vwebp from CMake. +d20b7707 update ChangeLog (tag: v1.0.0-rc3) +0d5fad46 add WEBP_DSP_INIT / WEBP_DSP_INIT_FUNC +d77bf512 add WEBP_DSP_INIT / WEBP_DSP_INIT_FUNC +c1cb86af fix 16b overflow in SSE2 +e577feb7 makefile.unix: add DEBUG flag for compiling w/ debug-symbol +99be34b3 cwebp,get_disto: fix bpp output +e122e511 cwebp,get_disto: fix bpp output +f5565ca8 cmake: Make sure we use near-lossless by default. +d898dc14 fix bug in WebPImport565: alpha value was not set +1c8f358d Fix CMake with WASM. +a0215fb7 webp_js: fix webp_js demo html +882784b0 update ChangeLog (tag: v1.0.0-rc2) +2f930e08 Revert "Use proper targets for CMake." +8165e8fb Use proper targets for CMake. +3f157dd5 Remove some very hard TODOs. +abb47760 Merge "Use proper targets for CMake." +cd758a17 {de,}mux/Makefile.am: add missing headers +e155dda0 Use proper targets for CMake. +b892b8ba makefile.unix,dist: use ascii for text output +64a57d05 add -version option to anim_dump,anim_diff and img2webp +994be82d Merge "Remove some very hard TODOs." +4033e1d7 Remove some very hard TODOs. +fc1b8e3a webp_js: fix webp_js demo html +15aa48d9 update ChangeLog (tag: v1.0.0-rc1) +e607dabc update AUTHORS +38410c08 [CFI] Remove function pointer casts +978eec25 [CFI] Remove function pointer casts +c57b2736 bump version to 1.0.0 +cba28853 update NEWS +c909d531 Merge "remove some deprecation warning on MacOSX" +217443c7 remove some deprecation warning on MacOSX +b672bdfa configure: quiet glut deprecation warnings on OS X +daa9fcaf configure: use sdl-config if available +dd174cae Merge "imagedec: support metadata reading for WebP image decoding" +641cedcc imagedec: support metadata reading for WebP image decoding +065b2ce1 anim_diff: add a couple missing newlines in Help() +c4cc1147 Merge "gif2webp: force low duration frames to 100ms" +09333097 gif2webp: force low duration frames to 100ms +e03f0ec3 sharp_yuv: use 14b fixed-point precision for gamma +b2db361c image_enc,WebPWritePNG: move locals after setjmp +74e82ec6 Merge "WebPPictureDistortion: fix big-endian results order" +645d04ca Merge "cwebp,get_disto: report bpp" +120f58c3 Merge "lossless*sse2: improve non-const 16-bit vector creation" +a7fe9412 WebPPictureDistortion: fix big-endian results order +e26fe066 cwebp,get_disto: report bpp +9df64e28 Merge changes Id5b4a1a4,Ia20ce844 +8043504f lossless*sse2: improve non-const 16-bit vector creation +1e3dfc48 Import: extract condition from loop +3b07d327 Import,RGBA: fix for BigEndian import +551948e4 Remove unused argument in VP8LBitsEntropy. +3005237a ReadWebP: fix for big-endian +499c395a Merge "anim_diff: expose the -max_diff option" +f69dcd69 Merge "remove WEBP_EXPERIMENTAL_FEATURES" +07d884d5 anim_diff: expose the -max_diff option +f4dd9256 remove WEBP_EXPERIMENTAL_FEATURES +94a8377b extract the command-line parsing helpers to example_util +fc09e6e2 PNM decoder: prevent unsupported depth=2 PAM case. +6de58603 MIPS64: Fix defined-but-not-used errors with WEBP_REDUCE_CSP +cbde5728 gif2webp: add support for reading from stdin +cf1c5054 Add an SSE4 version of some lossless color transforms. +45a8b5eb Fix lint error with man page. +cff38e8f Merge "PNG decoder: handle gAMA chunk" +59cb1a48 Merge "enable dc error-diffusion always" +78318b30 PNG decoder: handle gAMA chunk +664c21dd Merge "remove some TODOs" +815652de enable dc error-diffusion always +aec45cec remove some TODOs +5715dfce fix block-count[] increment in case of large image +c2d04f3e enable DC error-diffusion always for multi-pass +96bf07c5 use DC error diffusion for U/V at low-quality +1c59020b fix missing sse41 targets in makefile.unix +7a8e814b cosmetics: s/color_space/colorspace/ +05f6fe24 upsampling: rm asserts w/REDUCE_CSP+OMIT_C_CODE +b4cf5597 Merge "Upsampling SSE2/SSE4 speedup." +ccbeb32c Makefile.vc: add missing sse41 files +55403a9a Upsampling SSE2/SSE4 speedup. +807b53c4 Implement the upsampling/yuv functions in SSE41 +84101a81 Fix wasm WebP compilation +8bebd2a3 fix warning on MSVC +a7f93fe3 webpmux: allow reading argument from a file +b69f18a7 gif2webp.1: fix -loop_compatibility layout +72d530c0 Merge "fix lossless decoding w/WEBP_REDUCE_SIZE" +296c7dc4 fix lossless decoding w/WEBP_REDUCE_SIZE +0d5d029c Merge "ImgIoUtilReadFile: fix file leak upon error" +ae568ce7 ImgIoUtilReadFile: fix file leak upon error +796b5a8a Merge tag 'v0.6.1' +6b7a95fd update ChangeLog (tag: v0.6.1) +f66955de WEBP_REDUCE_CSP: restrict colorspace support +1af0df76 Merge "WEBP_REDUCE_CSP: restrict colorspace support" +6de20df0 WEBP_REDUCE_CSP: restrict colorspace support +a289d8e7 update ChangeLog (tag: v0.6.1-rc2) +c10a493c vwebp: disable double buffering on windows & mac +0d4466c2 webp_to_sdl.c: fix file mode +1b27bf8b WEBP_REDUCE_SIZE: disable all rescaler code +126be109 webpinfo: add -version option +0df22b9e WEBP_REDUCE_SIZE: disable all rescaler code +9add62b5 bump version to 0.6.1 +d3e26144 update NEWS +2edda639 README: add webpinfo section +9ca568ef Merge "right-size some tables" +31f1995c Merge "SSE2 implementation of HasAlphaXXX" +a80c46bd SSE2 implementation of HasAlphaXXX +083507f2 right-size some tables +2e5785b2 anim_utils.c: remove warning when !defined(WEBP_HAVE_GIF) +b299c47e add WEBP_REDUCE_SIZE +f593d71a enc: disable pic->stats/extra_info w/WEBP_DISABLE_STATS +541179a9 Merge "predictor_enc: fix build w/--disable-near-lossless" +5755a7ec predictor_enc: fix build w/--disable-near-lossless +eab5bab7 add WEBP_DISABLE_STATS +8052c585 remove some petty TODOs from vwebp. +c245343d move LOAD8x4 and STORE8x2 closer to their use location +b9e734fd dec,cosmetics: normalize function naming style +c188d546 dec: harmonize function suffixes +28c5ac81 dec_sse41: harmonize function suffixes +e65b72a3 Merge "introduce WebPHasAlpha8b and WebPHasAlpha32b" +b94cee98 dec_sse2: remove HE8uv_SSE2 +44a0ee3f introduce WebPHasAlpha8b and WebPHasAlpha32b +aebf59ac Merge "WebPPictureAllocARGB: align argb allocation" +c184665e WebPPictureAllocARGB: align argb allocation +3daf7509 WebPParseHeaders: remove obsolete animation TODO +80285d97 cmake: avoid security warnings under msvc +650eac55 cmake: don't set -Wall with MSVC +c462cd00 Remove useless code. +01a98217 Merge "remove WebPWorkerImpl declaration from the header" +3c49fc47 Merge "thread_utils: fix potentially bad call to Execute" +fde2782e thread_utils: fix potentially bad call to Execute +2a270c1d remove WebPWorkerImpl declaration from the header +f1f437cc remove mention of 'lossy-only parameters' from the doc +3879074d Merge "WebPMemToUint32: remove ptr cast to int" +04b029d2 WebPMemToUint32: remove ptr cast to int +b7971d0e dsp: avoid defining _C functions w/NEON builds +6ba98764 webpdec: correct alloc size check w/use_argb +5cfb3b0f normalize include guards +f433205e Merge changes Ia17c7dfc,I75423abb,Ia2f716b4,I161caa14,I4210081a, ... +8d033b14 {dec,enc}_neon: harmonize function suffixes x2 +0295e981 upsampling_neon: harmonize function suffixes +d572c4e5 yuv_neon: harmonize function suffixes +ab9c2500 rescaler_neon: harmonize function suffixes +93e0ce27 lossless_neon: harmonize function suffixes +22fbc50e lossless_enc_neon: harmonize function suffixes +447875b4 filters_neon,cosmetics: fix indent +e51bdd43 remove unused VP8TokenToStats() function +785da7ea enc_neon: harmonize function suffixes +bc1a251f dec_neon: harmonize function suffixes +61e535f1 dsp/lossless: workaround gcc-4.8 bug on arm +68b2eab7 cwebp: fix alpha reporting w/lossless & metadata +30042faa WebPDemuxGetI: add doc details around WebPFormatFeature +0a17f471 Merge "WIP: list includes as descendants of the project dir" +a4399721 WIP: list includes as descendants of the project dir +08275708 Merge "Make sure we reach the full range for alpha blending." +d361a6a7 yuv_sse2: harmonize function suffixes +6921aa6f upsampling_sse2: harmonize function suffixes +08c67d3e ssim_sse2: harmonize function suffixes +582a1b57 rescaler_sse2: harmonize function suffixes +2c1b18ba lossless_sse2: harmonize function suffixes +0ac46e81 lossless_enc_sse2: harmonize function suffixes +bc634d57 enc_sse2: harmonize function suffixes +bcb7347c dec_sse2: harmonize function suffixes +e14ad93c Make sure we reach the full range for alpha blending. +7038ca8d demux,StoreFrame: restore hdr size check to min req +fb3daad6 cpu: fix ssse3 check +be590e06 Merge "Fix CMake redefinition for HAVE_CPU_FEATURES_H" +35f736e1 Fix CMake redefinition for HAVE_CPU_FEATURES_H +a5216efc Fix integer overflow warning. +a9c8916b decode.h,WebPIDecGetRGB: clarify output ptr validity +3c74c645 gif2webp: handle 1-frame case properly + fix anim_diff +c7f295d3 Merge "gif2webp: introduce -loop_compatibility option" +b4e04677 gif2webp: introduce -loop_compatibility option +f78da3de add LOCAL_CLANG_PREREQ and avoid WORK_AROUND_GCC w/3.8+ +01c426f1 define WEBP_USE_INTRINSICS w/gcc-4.9+ +8635973d use sdl-config (if available) to determine the link flags +e9459382 use CPPFLAGS before CFLAGS +4a9d788e Merge "Android.mk,mips: fix clang build with r15" +4fbdc9fb Android.mk,mips: fix clang build with r15 +a80fcc4a ifdef code not used by Chrome/Android. +3993af12 Fix signed integer overflows. +f66f94ef anim_dump: small tool to dump frames from animated WebP +6eba857b Merge "rationalize the Makefile.am" +c5e34fba function definition cleanup +3822762a rationalize the Makefile.am +501ef6e4 configure style fix: animdiff -> anim_diff +f8bdc268 Merge "protect against NULL dump_folder[] value in ReadAnimatedImage()" +23bfc652 protect against NULL dump_folder[] value in ReadAnimatedImage() +8dc3d71b cosmetics,ReadAnimatedWebP: correct function comment +5bd40066 Merge changes I66a64a0a,I4d2e520f +7945575c cosmetics,webpinfo: remove an else after a return +8729fa11 cosmetics,cwebp: remove an else after a return +f324b7f9 cosmetics: normalize fn proto & decl param names +869eb369 CMake cleanups. +289e62a3 Remove declaration of unimplemented VP8ApplyNearLosslessPredict +20a94186 pnmdec,PAM: validate depth before calculating bytes_per_px +34130afe anim_encode: fix integer overflow +42c79aa6 Merge "Encoder: harmonize function suffixes" +b09307dc Encoder: harmonize function suffixes +bed0456d Merge "SSIM: harmonize the function suffix" +54f6a3cf lossless_sse2.c: fix some missed suffix changes +088f1dcc SSIM: harmonize the function suffix +86fc4dd9 webpdec: use ImgIoUtilCheckSizeArgumentsOverflow +08ea9ecd imageio: add ability restrict max image size +6f9daa4a jpegdec,ReadError: fix leaks on error +a0f72a4f VP8LTransformColorFunc: drop an non-respected 'const' from the signature. +8c934902 Merge "Lossess dec: harmonize the function suffixes" +622242aa Lossess dec: harmonize the function suffixes +1411f027 Lossless Enc: harmonize the function suffixes +24ad2e3c add const to two variables +46efe062 Merge "Allow the lossless cruncher to work for alpha." +8c3f9a47 Speed-up LZ77. +1aef4c71 Allow the lossless cruncher to work for alpha. +b8821dbd Improve the box LZ77 speed. +7beed280 add missing ()s to macro parameters +6473d20b Merge "fix Android standalone toolchain build" +dcefed95 Merge "build.gradle: fix arm64 build" +0c83a8bc Merge "yuv: harmonize suffix naming" +c6d1db4b fix Android standalone toolchain build +663a6d9d unify the ALTERNATE_CODE flag usage +73ea9f27 yuv: harmonize suffix naming +c71b68ac build.gradle: fix arm64 build +c4568b47 Rescaler: harmonize the suffix naming +6cb13b05 Merge "alpha_processing: harmonize the naming suffixes to be _C()" +83a3e69a Merge "simplify WEBP_EXTERN macro" +7295fde2 Merge "filters: harmonize the suffixes naming to _SSE2(), _C(), etc." +8e42ba4c simplify WEBP_EXTERN macro +331ab34b cost*.c: harmonize the suffix namings +b161f670 filters: harmonize the suffixes naming to _SSE2(), _C(), etc. +dec5e4d3 alpha_processing: harmonize the naming suffixes to be _C() +6878d427 fix memory leak in SDL_Init() +461ae555 Merge "configure: fix warnings in sdl check" +62486a22 configure: test for -Wundef +92982609 dsp.h: fix -Wundef w/__mips_dsp_rev +0265cede configure: fix warnings in sdl check +88c73d8a backward_references_enc.h: fix WINDOW_SIZE_BITS check +4ea49f6b rescaler_sse2.c: fix WEBP_RESCALER_FIX -> _RFIX typo +1b526638 Clean-up some CMake +87f57a4b Merge "cmake: fix gif lib detection when cross compiling" +b34a9db1 cosmetics,dec_sse2: remove some redundant comments +471c5755 cmake: fix gif lib detection when cross compiling +c793417a cmake: disable gif2webp if gif lib isn't found +dcbc1c88 cmake: split gif detection from IMG deps +66ad84f0 Merge "muxread: remove unreachable code" +50ec3ab7 muxread: remove unreachable code +7d67a164 Lossy encoding: smoothen transparent areas to improve compression +e50650c7 Merge "fix signature for DISABLE_TOKEN_BUFFER compilation" +671d2567 fix signature for DISABLE_TOKEN_BUFFER compilation +d6755580 cpu.cmake: use unique flag to test simd disable flags +28914528 Merge "Remove the argb* files." +8acb4942 Remove the argb* files. +3b62347b README: correct cmake invocation note +7ca0df13 Have the SSE2 version of PackARGB use common code. +7b250459 Merge "Re-use the transformed image when trying several LZ77 in lossless." +e132072f Re-use the transformed image when trying several LZ77 in lossless. +5d7a50ef Get code to compile in C++. +7b012987 configure: test for -Wparentheses-equality +f0569adb Fix man pages for multi-threading. +f1d5a397 multithread cruncher: only copy stats when picture->stats != NULL +f8c2ac15 Multi-thread the lossless cruncher. +a88c6522 Merge "Integrate a new LZ77 looking for matches in the neighborhood of a pixel only." +8f6df1d0 Unroll Predictors 10, 11 and 12. +355c3d1b Integrate a new LZ77 looking for matches in the neighborhood of a pixel only. +a1779a01 Refactor LZ77 handling in preparation for a new method. +67de68b5 Android.mk/build.gradle: fix mips build with clang from r14b +f209a548 Use the plane code and not the distance when computing statistics. +b903b80c Split cost-based backward references in its own file. +498cad34 Cosmetic changes in backward reference. +e4eb4587 lossless, VP8LTransformColor_C: make sure no overflow happens with colors. +af6deaff webpinfo: handle alpha flag mismatch +7caef29b Fix typo that creeped in. +39e19f92 Merge "near lossless: fix unsigned int overflow warnings." +9bbc0891 near lossless: fix unsigned int overflow warnings. +e1118d62 Merge "cosmetics,FindClosestDiscretized: use uint in mask creation" +186bc9b7 Merge "webpinfo: tolerate ALPH+VP8L" +b5887297 cosmetics,FindClosestDiscretized: use uint in mask creation +f1784aee near_lossless,FindClosestDiscretized: use unsigned ops +0d20abb3 webpinfo: tolerate ALPH+VP8L +972104b3 webpmux: tolerate false positive Alpha flag +dd7e83cc tiffdec,ReadTIFF: ensure data_size is < tsize_t max +d988eb7b tiffdec,MyRead: quiet -Wshorten-64-to-32 warning +dabda707 webpinfo: add support to parse Alpha bitstream +4c117643 webpinfo: correct background color output, BGRA->ARGB +defc98d7 Doc: clarify the role of quality in WebPConfig. +d78ff780 Merge "Fix code to compile with C++." +c8f14093 Fix code to compile with C++. +497dc6a7 pnmdec: sanitize invalid header output +d78e5867 Merge "configure: test for -Wconstant-conversion" +481e91eb Merge "pnmdec,PAM: set bytes_per_px based on depth when missing" +93b12753 configure: test for -Wconstant-conversion +645f0c53 pnmdec,PAM: set bytes_per_px based on depth when missing +e9154605 Merge "vwebp: activate GLUT double-buffering" +818d795b vwebp: activate GLUT double-buffering +d63e6f4b Add a man page for webpinfo +4d708435 Merge "NEON: implement ConvertRGB24ToY/BGR24/ARGB/RGBA32ToUV/ARGBToUV" +faf42213 NEON: implement ConvertRGB24ToY/BGR24/ARGB/RGBA32ToUV/ARGBToUV +b4d576fa Install man pages with CMake. +cbc1b921 webpinfo: add features to parse bitstream header +e644c556 Fix bad bit writer initialization. +b62cdad2 Merge "Implement a cruncher for lossless at method 6." +da3e4dfb use the exact constant for the gamma transfer function +a9c701e0 Merge "tiffdec: fix EXTRASAMPLES check" +adab8ce0 Implement a cruncher for lossless at method 6. +1b92b237 Merge "Fix VP8ApplyNearLossless to respect const and stride." +1923ff02 tiffdec: fix EXTRASAMPLES check +97cce5ba tiffdec: only request EXTRASAMPLES w/> 3 samples/px +0dcd85b6 Fix VP8ApplyNearLossless to respect const and stride. +f7682189 yuv: rationalize the C/SSE2 function naming +52245424 NEON implementation of some Sharp-YUV420 functions +690efd82 Avoid several backward reference copies. +4bb1f607 src/dec/vp8_dec.h, cosmetics: fix comments +285748be cmake: build/install webpinfo +78fd199c backward_references_enc.c: clear -Wshadow warnings +ae836410 WebPLog2FloorC: clear -Wshadow warning +d0b7404e Merge "WASM support" +134e314f WASM support +c08adb6f Merge "VP8LEnc: remove use of BitsLog2Ceiling()" +28c37ebd VP8LEnc: remove use of BitsLog2Ceiling() +2cb58ab2 webpinfo: output format as a human readable string +bb175a93 Merge "rename some symbols clashing with MSVC headers" +39eda658 Remove a duplicated pixel hash implementation. +36b8274d rename some symbols clashing with MSVC headers +274daf54 Add webpinfo tool. +ec5036e4 add explicit reference to /usr/local/{lib,inc} +18f0dfac Merge "fix TIFF encoder regarding rgbA/RGBA" +4e2b0b50 Merge "webpdec.h: fix a doc typo" +e2eeabff Merge "Install binaries, libraries and headers in CMake." +836607e6 webpdec.h: fix a doc typo +9273e441 fix TIFF encoder regarding rgbA/RGBA +17e3c11f Add limited PAM decoding support +5f624871 Install binaries, libraries and headers in CMake. +976adac1 Merge "lossless incremental decoding: fix missing eos_ test" +f8fad4fa lossless incremental decoding: fix missing eos_ test +27415d41 Merge "vwebp_sdl: fix the makefile.unix" +49566182 Merge "ImgIoUtilWriteFile(): use ImgIoUtilSetBinaryMode" +6f75a51b Analyze the transform entropy on the whole image. +a5e4e3af Use palette only if we can in entropy analysis. +75a9c3c4 Improve compression by better entropy analysis. +39cf6f4f vwebp_sdl: fix the makefile.unix +699b0416 ImgIoUtilWriteFile(): use ImgIoUtilSetBinaryMode +7d985bd1 Fix small entropy analysis bug. +6e7caf06 Optimize the color cache size. +833c9219 More efficient stochastic histogram merge. +5183326b Refactor the greedy histogram merge. +99f6f462 Merge "histogram_enc.c,MyRand: s/ul/u/ for unsigned constants" +80a22186 ssim.c: remove dead include +a128dfff histogram_enc.c,MyRand: s/ul/u/ for unsigned constants +693bf74e move the SSIM calculation code in ssim.c / ssim_sse2.c +10d791ca Merge "Fix the random generator in HistogramCombineStochastic." +fa63a966 Fix the random generator in HistogramCombineStochastic. +16be192f VP8LSetBitPos: remove the eos_ setting +027151ca don't erase the surface before blitting. +4105d565 disable WEBP_USE_XXX optimisations when EMSCRIPTEN is defined +9ee32a75 Merge "WebP-JS: emscripten-based Javascript decoder" +ca9f7b7d WebP-JS: emscripten-based Javascript decoder +868aa690 Perform greedy histogram merge in a unified way. +5b393f2d Merge "fix path typo for vwebp_sdl in Makefile.vc" +e0012bea CMake: only use libwebpdecoder for building dwebp +84c2a7b0 fix path typo for vwebp_sdl in Makefile.vc +1b0e4abf Merge "Add a flag to disable SIMD optimizations." +32263250 Add a flag to disable SIMD optimizations. +b494fdec optimize the ARGB->ARGB Import to use memcpy +f1536039 Merge "ReadWebP: decode directly into a pre-allocated buffer" +e69ed291 ReadWebP: decode directly into a pre-allocated buffer +57d8de8a Merge "vwebp_sdl: simple viewer based on SDL" +5cfd4ebc LZ77 interval speedups. Faster, smaller, simpler. +1e7ad88b PNM header decoder: add some basic numerical validation +17c7890c Merge "Add a decoder only library for WebP in CMake." +be733786 Merge "Add clang build fix for MSA" +03cda0e4 Add a decoder only library for WebP in CMake. +aa893914 Add clang build fix for MSA +31a92e97 Merge "imageio: add limited PNM support for reading" +dcf9d82a imageio: add limited PNM support for reading +6524fcd6 vwebp_sdl: simple viewer based on SDL +6cf24a24 get_disto: fix reference file read +43d472aa Merge tag 'v0.6.0' +50d1a848 update ChangeLog (tag: v0.6.0, origin/0.6.0) 20a7fea0 extras/Makefile.am: fix libwebpextras.la reference 415f3ffe update ChangeLog (tag: v0.6.0-rc3) 3c6d1224 update NEWS ee4a4141 update AUTHORS 32ed856f Fix "all|no frames are keyframes" settings. +1c3190b6 Merge "Fix "all|no frames are keyframes" settings." f4dc56fd disable GradientUnfilter_NEON +4f3e3bbd disable GradientUnfilter_NEON +2dc0bdca Fix "all|no frames are keyframes" settings. 0d8e0588 img2webp: treat -loop as a no-op w/single images b0450139 ReadImage(): restore size reporting 0ad3b4ef update ChangeLog (tag: v0.6.0-rc2) @@ -71,7 +613,7 @@ b016cb91 NEON: faster fancy upsampling f04eb376 Merge tag 'v0.5.2' 341d711c NEON: 5% faster conversion to RGB565 and RGBA4444 abb54827 remove Clang warnings with unused arch arguments. -ece9684f update ChangeLog (tag: v0.5.2-rc2, tag: v0.5.2, origin/0.5.2, 0.5.2) +ece9684f update ChangeLog (tag: v0.5.2-rc2, tag: v0.5.2, origin/0.5.2) aa7744ca anim_util: quiet implicit conv warnings in 32-bit d9120271 jpegdec: correct ContextFill signature 24eb3940 Remove some errors when compiling the code as C++. @@ -358,7 +900,7 @@ bbb6ecd9 Merge "Add MSA optimized distortion functions" c0991a14 io,EmitRescaledAlphaYUV: factor out a common expr 48bf5ed1 build.gradle: remove tab bfef6c9f Merge tag 'v0.5.1' -3d97bb75 update ChangeLog (tag: v0.5.1, origin/0.5.1, 0.5.1) +3d97bb75 update ChangeLog (tag: v0.5.1, origin/0.5.1) deb54d91 Clarify the expected 'config' lifespan in WebPIDecode() 435308e0 Add MSA optimized encoder transform functions dce64bfa Add MSA optimized alpha filter functions @@ -552,7 +1094,7 @@ b74657fb configure: fix builtin detection w/-Werror 6c1d7631 avoid Yoda style for comparison 8ce975ac SSE optimization for vector mismatch. 7db53831 Merge tag 'v0.5.0' -37f04949 update ChangeLog (tag: v0.5.0-rc1, tag: v0.5.0, origin/0.5.0, 0.5.0) +37f04949 update ChangeLog (tag: v0.5.0-rc1, tag: v0.5.0, origin/0.5.0) 7e7b6ccc faster rgb565/rgb4444/argb output 4c7f565f update NEWS 1f62b6b2 update AUTHORS @@ -1336,7 +1878,7 @@ b5a36cc9 add -near_lossless [0..100] experimental option 0524d9e5 dsp: detect mips64 & disable mips32 code d3485d96 cwebp.1: fix quality description placement 29a9fe22 Merge tag 'v0.4.1' -8af27718 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1) +8af27718 update ChangeLog (tag: v0.4.1, origin/0.4.1) e09e9ff6 Record & log the image pre-processing time. f59c0b4b iosbuild.sh: specify optimization flags 8d34ea3e update ChangeLog (tag: v0.4.1-rc1) @@ -1721,7 +2263,7 @@ ea59a8e9 Merge "Merge tag 'v0.4.0'" effcb0fd Merge tag 'v0.4.0' 7c76255d autoconf: update ax_pthread.m4 fff2a11b make -short work with -print_ssim, -print_psnr, etc. -68e7901d update ChangeLog (tag: v0.4.0-rc1, tag: v0.4.0, origin/0.4.0, 0.4.0) +68e7901d update ChangeLog (tag: v0.4.0-rc1, tag: v0.4.0, origin/0.4.0) 256e4333 update NEWS description with new general features 29625340 Merge "gif2webp: don't use C99 %zu" into 0.4.0 3b9f9dd0 gif2webp: don't use C99 %zu @@ -2497,7 +3039,7 @@ a61a824b Merge "Add NULL check in chunk APIs" a0770727 mux struct naming 6c66dde8 Merge "Tune Lossless encoder" ab5ea217 Tune Lossless encoder -74fefc8c Update ChangeLog (tag: v0.2.1, origin/0.2.0, 0.2.0) +74fefc8c Update ChangeLog (tag: v0.2.1, origin/0.2.0) 92f8059c Rename some chunks: 3bb4bbeb Merge "Mux API change:" d0c79f05 Mux API change: diff --git a/src/3rdparty/libwebp/NEWS b/src/3rdparty/libwebp/NEWS index 3bf4bd0..5c0fc8c 100644 --- a/src/3rdparty/libwebp/NEWS +++ b/src/3rdparty/libwebp/NEWS @@ -1,3 +1,58 @@ +- 7/4/2019: version 1.0.3 + This is a binary compatible release. + * resize fixes for Nx1 sizes and the addition of non-opaque alpha values for + odd sizes (issues #418, #434) + * lossless encode/decode performance improvements + * lossy compression performance improvement at low quality levels with flat + content (issue #432) + * python swig files updated to support python 3 + Tool updates: + vwebp will now preserve the aspect ratio of images that exceed monitor + resolution by scaling the image to fit (issue #433) + +- 1/14/2019: version 1.0.2 + This is a binary compatible release. + * (Windows) unicode file support in the tools (linux and mac already had + support, issue #398) + * lossless encoder speedups + * lossy encoder speedup on ARM + * lossless multi-threaded security fix (chromium:917029) + +- 11/2/2018: version 1.0.1 + This is a binary compatible release. + * lossless encoder speedups + * big-endian fix for alpha decoding (issue #393) + * gif2webp fix for loop count=65535 transcode (issue #382) + * further security related hardening in libwebp & libwebpmux + (issues #383, #385, #386, #387, #388, #391) + (oss-fuzz #9099, #9100, #9105, #9106, #9111, #9112, #9119, #9123, #9170, + #9178, #9179, #9183, #9186, #9191, #9364, #9417, #9496, #10349, + #10423, #10634, #10700, #10838, #10922, #11021, #11088, #11152) + * miscellaneous bug & build fixes (issues #381, #394, #396, #397, #400) + +- 4/2/2018: version 1.0.0 + This is a binary compatible release. + * lossy encoder improvements to avoid chroma shifts in various circumstances + (issues #308, #340) + * big-endian fixes for decode, RGBA import and WebPPictureDistortion + Tool updates: + gifwebp, anim_diff - default duration behavior (<= 10ms) changed to match + web browsers, transcoding tools (issue #379) + img2webp, webpmux - allow options to be passed in via a file (issue #355) + +- 11/24/2017: version 0.6.1 + This is a binary compatible release. + * lossless performance and compression improvements + a new 'cruncher' mode + (-m 6 -q 100) + * ARM performance improvements with clang (15-20% w/ndk r15c, issue #339) + * webp-js: emscripten/webassembly based javascript decoder + * miscellaneous bug & build fixes (issue #329, #332, #343, #353, #360, #361, + #363) + Tool updates / additions: + added webpinfo - prints file format information (issue #330) + gif2webp - loop behavior modified to match Chrome M63+ (crbug.com/649264); + '-loop_compatibility' can be used for the old behavior + - 1/26/2017: version 0.6.0 * lossless performance and compression improvements * miscellaneous performance improvements (SSE2, NEON, MSA) diff --git a/src/3rdparty/libwebp/README b/src/3rdparty/libwebp/README index 4c15c4a..60da8a2 100644 --- a/src/3rdparty/libwebp/README +++ b/src/3rdparty/libwebp/README @@ -4,7 +4,7 @@ \__\__/\____/\_____/__/ ____ ___ / _/ / \ \ / _ \/ _/ / \_/ / / \ \ __/ \__ - \____/____/\_____/_____/____/v0.6.0 + \____/____/\_____/_____/____/v1.0.3 Description: ============ @@ -113,8 +113,8 @@ make install CMake: ------ -The support for CMake is minimal: it only helps you compile libwebp, cwebp and -dwebp. +With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp, webpinfo +and the JS bindings. Prerequisites: A compiler (e.g., gcc with autotools) and CMake. @@ -123,18 +123,27 @@ minimal build: $ sudo apt-get install build-essential cmake When building from git sources, you will need to run cmake to generate the -configure script. +makefiles. mkdir build && cd build && cmake ../ make make install -If you also want cwebp or dwebp, you will need to enable them through CMake: +If you also want any of the executables, you will need to enable them through +CMake, e.g.: cmake -DWEBP_BUILD_CWEBP=ON -DWEBP_BUILD_DWEBP=ON ../ or through your favorite interface (like ccmake or cmake-qt-gui). +Use option -DWEBP_UNICODE=ON for Unicode support on Windows (with chcp 65001). + +Finally, once installed, you can also use WebP in your CMake project by doing: + +find_package(WebP) + +which will define the CMake variables WebP_INCLUDE_DIRS and WebP_LIBRARIES. + Gradle: ------- The support for Gradle is minimal: it only helps you compile libwebp, cwebp and @@ -360,6 +369,23 @@ Use following options to convert into alternate image formats: -quiet ....... quiet mode, don't print anything -noasm ....... disable all assembly optimizations +WebP file analysis tool: +======================== + +'webpinfo' can be used to print out the chunk level structure and bitstream +header information of WebP files. It can also check if the files are of valid +WebP format. + +Usage: webpinfo [options] in_files +Note: there could be multiple input files; + options must come before input files. +Options: + -version ........... Print version number and exit. + -quiet ............. Do not show chunk parsing information. + -diag .............. Show parsing error diagnosis. + -summary ........... Show chunk stats summary. + -bitstream_info .... Parse bitstream header. + Visualization tool: =================== @@ -378,12 +404,14 @@ Options are: -nofilter .... disable in-loop filtering -dither <int> dithering strength (0..100), default=50 -noalphadither disable alpha plane dithering + -usebgcolor .. display background color -mt .......... use multi-threading -info ........ print info -h ........... this help message Keyboard shortcuts: 'c' ................ toggle use of color profile + 'b' ................ toggle background color display 'i' ................ overlay file information 'd' ................ disable blending & disposal (debug) 'q' / 'Q' / ESC .... quit @@ -434,6 +462,7 @@ File-level options (only used at the start of compression): -mixed ............... use mixed lossy/lossless automatic mode -v ................... verbose mode -h ................... this help + -version ............. print version number and exit Per-frame options (only used for subsequent images input): -d <int> ............. frame duration in ms (default: 100) @@ -445,6 +474,9 @@ Per-frame options (only used for subsequent images input): example: img2webp -loop 2 in0.png -lossy in1.jpg -d 80 in2.tiff -o out.webp +Note: if a single file name is passed as the argument, the arguments will be +tokenized from this file. The file name must not start with the character '-'. + Animated GIF conversion: ======================== Animated GIF files can be converted to WebP files with animation using the @@ -470,6 +502,8 @@ Options: -metadata <string> ..... comma separated list of metadata to copy from the input to the output if present Valid values: all, none, icc, xmp (default) + -loop_compatibility .... use compatibility mode for Chrome + version prior to M62 (inclusive) -mt .................... use multi-threading if available -version ............... print version number and exit @@ -498,6 +532,11 @@ Options: -min_psnr <float> ... minimum per-frame PSNR -raw_comparison ..... if this flag is not used, RGB is premultiplied before comparison + -max_diff <int> ..... maximum allowed difference per channel + between corresponding pixels in subsequent + frames + -h .................. this help + -version ............ print version number and exit Building: --------- @@ -558,7 +597,7 @@ The encoding flow looks like: // Setup a config, starting form a preset and tuning some additional // parameters WebPConfig config; - if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) + if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) { return 0; // version error } // ... additional tuning diff --git a/src/3rdparty/libwebp/qt_attribution.json b/src/3rdparty/libwebp/qt_attribution.json index 09f91dd..8d15db5 100644 --- a/src/3rdparty/libwebp/qt_attribution.json +++ b/src/3rdparty/libwebp/qt_attribution.json @@ -6,7 +6,7 @@ "Description": "WebP is a new image format that provides lossless and lossy compression for images on the web.", "Homepage": "https://developers.google.com/speed/webp/", - "Version": "0.6.0", + "Version": "1.0.3", "License": "BSD 3-clause \"New\" or \"Revised\" License", "LicenseId": "BSD-3-Clause", "LicenseFile": "COPYING", diff --git a/src/3rdparty/libwebp/src/dec/alpha_dec.c b/src/3rdparty/libwebp/src/dec/alpha_dec.c index 83ffd4b..bce735b 100644 --- a/src/3rdparty/libwebp/src/dec/alpha_dec.c +++ b/src/3rdparty/libwebp/src/dec/alpha_dec.c @@ -12,13 +12,13 @@ // Author: Skal (pascal.massimino@gmail.com) #include <stdlib.h> -#include "./alphai_dec.h" -#include "./vp8i_dec.h" -#include "./vp8li_dec.h" -#include "../dsp/dsp.h" -#include "../utils/quant_levels_dec_utils.h" -#include "../utils/utils.h" -#include "../webp/format_constants.h" +#include "src/dec/alphai_dec.h" +#include "src/dec/vp8i_dec.h" +#include "src/dec/vp8li_dec.h" +#include "src/dsp/dsp.h" +#include "src/utils/quant_levels_dec_utils.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" //------------------------------------------------------------------------------ // ALPHDecoder object. diff --git a/src/3rdparty/libwebp/src/dec/alphai_dec.h b/src/3rdparty/libwebp/src/dec/alphai_dec.h index 561e815..a64104a 100644 --- a/src/3rdparty/libwebp/src/dec/alphai_dec.h +++ b/src/3rdparty/libwebp/src/dec/alphai_dec.h @@ -11,11 +11,11 @@ // // Author: Urvang (urvang@google.com) -#ifndef WEBP_DEC_ALPHAI_H_ -#define WEBP_DEC_ALPHAI_H_ +#ifndef WEBP_DEC_ALPHAI_DEC_H_ +#define WEBP_DEC_ALPHAI_DEC_H_ -#include "./webpi_dec.h" -#include "../utils/filters_utils.h" +#include "src/dec/webpi_dec.h" +#include "src/utils/filters_utils.h" #ifdef __cplusplus extern "C" { @@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec); } // extern "C" #endif -#endif /* WEBP_DEC_ALPHAI_H_ */ +#endif // WEBP_DEC_ALPHAI_DEC_H_ diff --git a/src/3rdparty/libwebp/src/dec/buffer_dec.c b/src/3rdparty/libwebp/src/dec/buffer_dec.c index c685fd5..3cd94eb 100644 --- a/src/3rdparty/libwebp/src/dec/buffer_dec.c +++ b/src/3rdparty/libwebp/src/dec/buffer_dec.c @@ -13,15 +13,15 @@ #include <stdlib.h> -#include "./vp8i_dec.h" -#include "./webpi_dec.h" -#include "../utils/utils.h" +#include "src/dec/vp8i_dec.h" +#include "src/dec/webpi_dec.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // WebPDecBuffer // Number of bytes per pixel for the different color-spaces. -static const int kModeBpp[MODE_LAST] = { +static const uint8_t kModeBpp[MODE_LAST] = { 3, 4, 3, 4, 4, 2, 2, 4, 4, 4, 2, // pre-multiplied modes 1, 1 }; @@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) { // strictly speaking, the very last (or first, if flipped) row // doesn't require padding. #define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE) \ - (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH) + ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)) static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { int ok = 1; @@ -74,7 +74,8 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { } else { // RGB checks const WebPRGBABuffer* const buf = &buffer->u.RGBA; const int stride = abs(buf->stride); - const uint64_t size = MIN_BUFFER_SIZE(width, height, stride); + const uint64_t size = + MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride); ok &= (size <= buf->size); ok &= (stride >= width * kModeBpp[mode]); ok &= (buf->rgba != NULL); @@ -98,9 +99,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) { uint64_t uv_size = 0, a_size = 0, total_size; // We need memory and it hasn't been allocated yet. // => initialize output buffer, now that dimensions are known. - const int stride = w * kModeBpp[mode]; - const uint64_t size = (uint64_t)stride * h; + int stride; + uint64_t size; + if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) { + return VP8_STATUS_INVALID_PARAM; + } + stride = w * kModeBpp[mode]; + size = (uint64_t)stride * h; if (!WebPIsRGBMode(mode)) { uv_stride = (w + 1) / 2; uv_size = (uint64_t)uv_stride * ((h + 1) / 2); @@ -169,11 +175,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) { return VP8_STATUS_OK; } -VP8StatusCode WebPAllocateDecBuffer(int w, int h, +VP8StatusCode WebPAllocateDecBuffer(int width, int height, const WebPDecoderOptions* const options, - WebPDecBuffer* const out) { + WebPDecBuffer* const buffer) { VP8StatusCode status; - if (out == NULL || w <= 0 || h <= 0) { + if (buffer == NULL || width <= 0 || height <= 0) { return VP8_STATUS_INVALID_PARAM; } if (options != NULL) { // First, apply options if there is any. @@ -182,33 +188,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h, const int ch = options->crop_height; const int x = options->crop_left & ~1; const int y = options->crop_top & ~1; - if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) { + if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || + x + cw > width || y + ch > height) { return VP8_STATUS_INVALID_PARAM; // out of frame boundary. } - w = cw; - h = ch; + width = cw; + height = ch; } + if (options->use_scaling) { +#if !defined(WEBP_REDUCE_SIZE) int scaled_width = options->scaled_width; int scaled_height = options->scaled_height; if (!WebPRescalerGetScaledDimensions( - w, h, &scaled_width, &scaled_height)) { + width, height, &scaled_width, &scaled_height)) { return VP8_STATUS_INVALID_PARAM; } - w = scaled_width; - h = scaled_height; + width = scaled_width; + height = scaled_height; +#else + return VP8_STATUS_INVALID_PARAM; // rescaling not supported +#endif } } - out->width = w; - out->height = h; + buffer->width = width; + buffer->height = height; // Then, allocate buffer for real. - status = AllocateBuffer(out); + status = AllocateBuffer(buffer); if (status != VP8_STATUS_OK) return status; // Use the stride trick if vertical flip is needed. if (options != NULL && options->flip) { - status = WebPFlipBuffer(out); + status = WebPFlipBuffer(buffer); } return status; } diff --git a/src/3rdparty/libwebp/src/dec/common_dec.h b/src/3rdparty/libwebp/src/dec/common_dec.h index 6961e22..b158550 100644 --- a/src/3rdparty/libwebp/src/dec/common_dec.h +++ b/src/3rdparty/libwebp/src/dec/common_dec.h @@ -11,8 +11,8 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_DEC_COMMON_H_ -#define WEBP_DEC_COMMON_H_ +#ifndef WEBP_DEC_COMMON_DEC_H_ +#define WEBP_DEC_COMMON_DEC_H_ // intra prediction modes enum { B_DC_PRED = 0, // 4x4 modes @@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3, NUM_PROBAS = 11 }; -#endif // WEBP_DEC_COMMON_H_ +#endif // WEBP_DEC_COMMON_DEC_H_ diff --git a/src/3rdparty/libwebp/src/dec/frame_dec.c b/src/3rdparty/libwebp/src/dec/frame_dec.c index f91e27f..bda9e1a 100644 --- a/src/3rdparty/libwebp/src/dec/frame_dec.c +++ b/src/3rdparty/libwebp/src/dec/frame_dec.c @@ -12,13 +12,13 @@ // Author: Skal (pascal.massimino@gmail.com) #include <stdlib.h> -#include "./vp8i_dec.h" -#include "../utils/utils.h" +#include "src/dec/vp8i_dec.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // Main reconstruction function. -static const int kScan[16] = { +static const uint16_t kScan[16] = { 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, @@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) { #define MIN_DITHER_AMP 4 #define DITHER_AMP_TAB_SIZE 12 -static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = { +static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = { // roughly, it's dqm->uv_mat_[1] 8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1 }; @@ -338,7 +338,6 @@ void VP8InitDithering(const WebPDecoderOptions* const options, for (s = 0; s < NUM_MB_SEGMENTS; ++s) { VP8QuantMatrix* const dqm = &dec->dqm_[s]; if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) { - // TODO(skal): should we specially dither more for uv_quant_ < 0? const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_; dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3; } @@ -400,7 +399,9 @@ static void DitherRow(VP8Decoder* const dec) { #define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB // Finalize and transmit a complete row. Return false in case of user-abort. -static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { +static int FinishRow(void* arg1, void* arg2) { + VP8Decoder* const dec = (VP8Decoder*)arg1; + VP8Io* const io = (VP8Io*)arg2; int ok = 1; const VP8ThreadContext* const ctx = &dec->thread_ctx_; const int cache_id = ctx->id_; @@ -448,10 +449,9 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) { if (y_end > io->crop_bottom) { y_end = io->crop_bottom; // make sure we don't overflow on last row. } + // If dec->alpha_data_ is not NULL, we have some alpha plane present. io->a = NULL; if (dec->alpha_data_ != NULL && y_start < y_end) { - // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a - // good idea. io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start); if (io->a == NULL) { return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, @@ -558,7 +558,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { if (io->bypass_filtering) { dec->filter_type_ = 0; } - // TODO(skal): filter type / strength / sharpness forcing // Define the area where we can skip in-loop filtering, in case of cropping. // @@ -569,8 +568,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { // Means: there's a dependency chain that goes all the way up to the // top-left corner of the picture (MB #0). We must filter all the previous // macroblocks. - // TODO(skal): add an 'approximate_decoding' option, that won't produce - // a 1:1 bit-exactness for complex filtering? { const int extra_pixels = kFilterExtraRows[dec->filter_type_]; if (dec->filter_type_ == 2) { @@ -651,7 +648,7 @@ static int InitThreadContext(VP8Decoder* const dec) { } worker->data1 = dec; worker->data2 = (void*)&dec->thread_ctx_.io_; - worker->hook = (WebPWorkerHook)FinishRow; + worker->hook = FinishRow; dec->num_caches_ = (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1; } else { @@ -671,15 +668,9 @@ int VP8GetThreadMethod(const WebPDecoderOptions* const options, (void)height; assert(headers == NULL || !headers->is_lossless); #if defined(WEBP_USE_THREAD) - if (width < MIN_WIDTH_FOR_THREADS) return 0; - // TODO(skal): tune the heuristic further -#if 0 - if (height < 2 * width) return 2; + if (width >= MIN_WIDTH_FOR_THREADS) return 2; #endif - return 2; -#else // !WEBP_USE_THREAD return 0; -#endif } #undef MT_CACHE_LINES @@ -728,7 +719,7 @@ static int AllocateMemory(VP8Decoder* const dec) { } mem = (uint8_t*)dec->mem_; - dec->intra_t_ = (uint8_t*)mem; + dec->intra_t_ = mem; mem += intra_pred_mode_size; dec->yuv_t_ = (VP8TopSamples*)mem; @@ -750,7 +741,7 @@ static int AllocateMemory(VP8Decoder* const dec) { mem = (uint8_t*)WEBP_ALIGN(mem); assert((yuv_size & WEBP_ALIGN_CST) == 0); - dec->yuv_b_ = (uint8_t*)mem; + dec->yuv_b_ = mem; mem += yuv_size; dec->mb_data_ = (VP8MBData*)mem; @@ -766,7 +757,7 @@ static int AllocateMemory(VP8Decoder* const dec) { const int extra_rows = kFilterExtraRows[dec->filter_type_]; const int extra_y = extra_rows * dec->cache_y_stride_; const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_; - dec->cache_y_ = ((uint8_t*)mem) + extra_y; + dec->cache_y_ = mem + extra_y; dec->cache_u_ = dec->cache_y_ + 16 * num_caches * dec->cache_y_stride_ + extra_uv; dec->cache_v_ = dec->cache_u_ @@ -776,7 +767,7 @@ static int AllocateMemory(VP8Decoder* const dec) { mem += cache_size; // alpha plane - dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL; + dec->alpha_plane_ = alpha_size ? mem : NULL; mem += alpha_size; assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_); diff --git a/src/3rdparty/libwebp/src/dec/idec_dec.c b/src/3rdparty/libwebp/src/dec/idec_dec.c index 78fb2e7..9bc9166 100644 --- a/src/3rdparty/libwebp/src/dec/idec_dec.c +++ b/src/3rdparty/libwebp/src/dec/idec_dec.c @@ -15,10 +15,10 @@ #include <string.h> #include <stdlib.h> -#include "./alphai_dec.h" -#include "./webpi_dec.h" -#include "./vp8i_dec.h" -#include "../utils/utils.h" +#include "src/dec/alphai_dec.h" +#include "src/dec/webpi_dec.h" +#include "src/dec/vp8i_dec.h" +#include "src/utils/utils.h" // In append mode, buffer allocations increase as multiples of this value. // Needs to be a power of 2. @@ -140,10 +140,9 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) { if (NeedCompressedAlpha(idec)) { ALPHDecoder* const alph_dec = dec->alph_dec_; dec->alpha_data_ += offset; - if (alph_dec != NULL) { + if (alph_dec != NULL && alph_dec->vp8l_dec_ != NULL) { if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) { VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_; - assert(alph_vp8l_dec != NULL); assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN); VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_, dec->alpha_data_ + ALPHA_HEADER_LEN, @@ -283,10 +282,8 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec, static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) { if (idec->state_ == STATE_VP8_DATA) { - VP8Io* const io = &idec->io_; - if (io->teardown != NULL) { - io->teardown(io); - } + // Synchronize the thread, clean-up and check for errors. + VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_); } idec->state_ = STATE_ERROR; return error; @@ -451,7 +448,10 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { VP8Decoder* const dec = (VP8Decoder*)idec->dec_; VP8Io* const io = &idec->io_; - assert(dec->ready_); + // Make sure partition #0 has been read before, to set dec to ready_. + if (!dec->ready_) { + return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); + } for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { if (idec->last_mb_y_ != dec->mb_y_) { if (!VP8ParseIntraModeRow(&dec->br_, dec)) { @@ -473,6 +473,12 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { MemDataSize(&idec->mem_) > MAX_MB_SIZE) { return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); } + // Synchronize the threads. + if (dec->mt_method_ > 0) { + if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) { + return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); + } + } RestoreContext(&context, dec, token_br); return VP8_STATUS_SUSPENDED; } @@ -491,6 +497,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { } // Synchronize the thread and check for errors. if (!VP8ExitCritical(dec, io)) { + idec->state_ = STATE_ERROR; // prevent re-entry in IDecError return IDecError(idec, VP8_STATUS_USER_ABORT); } dec->ready_ = 0; @@ -571,6 +578,10 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { status = DecodePartition0(idec); } if (idec->state_ == STATE_VP8_DATA) { + const VP8Decoder* const dec = (VP8Decoder*)idec->dec_; + if (dec == NULL) { + return VP8_STATUS_SUSPENDED; // can't continue if we have no decoder. + } status = DecodeRemaining(idec); } if (idec->state_ == STATE_VP8L_HEADER) { @@ -673,12 +684,12 @@ void WebPIDelete(WebPIDecoder* idec) { //------------------------------------------------------------------------------ // Wrapper toward WebPINewDecoder -WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, +WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer, size_t output_buffer_size, int output_stride) { const int is_external_memory = (output_buffer != NULL) ? 1 : 0; WebPIDecoder* idec; - if (mode >= MODE_YUV) return NULL; + if (csp >= MODE_YUV) return NULL; if (is_external_memory == 0) { // Overwrite parameters to sane values. output_buffer_size = 0; output_stride = 0; @@ -689,7 +700,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, } idec = WebPINewDecoder(NULL); if (idec == NULL) return NULL; - idec->output_.colorspace = mode; + idec->output_.colorspace = csp; idec->output_.is_external_memory = is_external_memory; idec->output_.u.RGBA.rgba = output_buffer; idec->output_.u.RGBA.stride = output_stride; diff --git a/src/3rdparty/libwebp/src/dec/io_dec.c b/src/3rdparty/libwebp/src/dec/io_dec.c index 8bfab86..e603f19 100644 --- a/src/3rdparty/libwebp/src/dec/io_dec.c +++ b/src/3rdparty/libwebp/src/dec/io_dec.c @@ -13,11 +13,11 @@ #include <assert.h> #include <stdlib.h> -#include "../dec/vp8i_dec.h" -#include "./webpi_dec.h" -#include "../dsp/dsp.h" -#include "../dsp/yuv.h" -#include "../utils/utils.h" +#include "src/dec/vp8i_dec.h" +#include "src/dec/webpi_dec.h" +#include "src/dsp/dsp.h" +#include "src/dsp/yuv.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // Main YUV<->RGB conversion functions @@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p, int num_rows; const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows); uint8_t* const base_rgba = buf->rgba + start_y * buf->stride; -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) uint8_t* alpha_dst = base_rgba; #else uint8_t* alpha_dst = base_rgba + 1; @@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p, //------------------------------------------------------------------------------ // YUV rescaling (no final RGB conversion needed) +#if !defined(WEBP_REDUCE_SIZE) static int Rescale(const uint8_t* src, int src_stride, int new_lines, WebPRescaler* const wrk) { int num_lines_out = 0; @@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos, int max_lines_out) { const WebPRGBABuffer* const buf = &p->output->u.RGBA; uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride; -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) uint8_t* alpha_dst = base_rgba; #else uint8_t* alpha_dst = base_rgba + 1; @@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) { return 1; } +#endif // WEBP_REDUCE_SIZE + //------------------------------------------------------------------------------ // Default custom functions @@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) { WebPInitUpsamplers(); } if (io->use_scaling) { +#if !defined(WEBP_REDUCE_SIZE) const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p); if (!ok) { return 0; // memory error } +#else + return 0; // rescaling support not compiled +#endif } else { if (is_rgb) { WebPInitSamplers(); @@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) { } } - if (is_rgb) { - VP8YUVInit(); - } return 1; } diff --git a/src/3rdparty/libwebp/src/dec/quant_dec.c b/src/3rdparty/libwebp/src/dec/quant_dec.c index 14e3198..a0ac018 100644 --- a/src/3rdparty/libwebp/src/dec/quant_dec.c +++ b/src/3rdparty/libwebp/src/dec/quant_dec.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./vp8i_dec.h" +#include "src/dec/vp8i_dec.h" static WEBP_INLINE int clip(int v, int M) { return v < 0 ? 0 : v > M ? M : v; @@ -61,12 +61,17 @@ static const uint16_t kAcTable[128] = { void VP8ParseQuant(VP8Decoder* const dec) { VP8BitReader* const br = &dec->br_; - const int base_q0 = VP8GetValue(br, 7); - const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0; - const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0; - const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0; - const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0; - const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0; + const int base_q0 = VP8GetValue(br, 7, "global-header"); + const int dqy1_dc = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 4, "global-header") : 0; + const int dqy2_dc = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 4, "global-header") : 0; + const int dqy2_ac = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 4, "global-header") : 0; + const int dquv_dc = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 4, "global-header") : 0; + const int dquv_ac = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 4, "global-header") : 0; const VP8SegmentHeader* const hdr = &dec->segment_hdr_; int i; diff --git a/src/3rdparty/libwebp/src/dec/tree_dec.c b/src/3rdparty/libwebp/src/dec/tree_dec.c index 9e805f6..1c6fdea 100644 --- a/src/3rdparty/libwebp/src/dec/tree_dec.c +++ b/src/3rdparty/libwebp/src/dec/tree_dec.c @@ -11,15 +11,19 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./vp8i_dec.h" -#include "../utils/bit_reader_inl_utils.h" +#include "src/dec/vp8i_dec.h" +#include "src/utils/bit_reader_inl_utils.h" +#if !defined(USE_GENERIC_TREE) #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then. -#define USE_GENERIC_TREE +#define USE_GENERIC_TREE 1 // ALTERNATE_CODE +#else +#define USE_GENERIC_TREE 0 #endif +#endif // USE_GENERIC_TREE -#ifdef USE_GENERIC_TREE +#if (USE_GENERIC_TREE == 1) static const int8_t kYModesIntra4[18] = { -B_DC_PRED, 1, -B_TM_PRED, 2, @@ -292,20 +296,21 @@ static void ParseIntraMode(VP8BitReader* const br, // to decode more than 1 keyframe. if (dec->segment_hdr_.update_map_) { // Hardcoded tree parsing - block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) - ? VP8GetBit(br, dec->proba_.segments_[1]) - : 2 + VP8GetBit(br, dec->proba_.segments_[2]); + block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0], "segments") + ? VP8GetBit(br, dec->proba_.segments_[1], "segments") + : VP8GetBit(br, dec->proba_.segments_[2], "segments") + 2; } else { block->segment_ = 0; // default for intra } - if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_); + if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_, "skip"); - block->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first + block->is_i4x4_ = !VP8GetBit(br, 145, "block-size"); if (!block->is_i4x4_) { // Hardcoded 16x16 intra-mode decision tree. const int ymode = - VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED) - : (VP8GetBit(br, 163) ? V_PRED : DC_PRED); + VP8GetBit(br, 156, "pred-modes") ? + (VP8GetBit(br, 128, "pred-modes") ? TM_PRED : H_PRED) : + (VP8GetBit(br, 163, "pred-modes") ? V_PRED : DC_PRED); block->imodes_[0] = ymode; memset(top, ymode, 4 * sizeof(*top)); memset(left, ymode, 4 * sizeof(*left)); @@ -317,25 +322,28 @@ static void ParseIntraMode(VP8BitReader* const br, int x; for (x = 0; x < 4; ++x) { const uint8_t* const prob = kBModesProba[top[x]][ymode]; -#ifdef USE_GENERIC_TREE +#if (USE_GENERIC_TREE == 1) // Generic tree-parsing - int i = kYModesIntra4[VP8GetBit(br, prob[0])]; + int i = kYModesIntra4[VP8GetBit(br, prob[0], "pred-modes")]; while (i > 0) { - i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])]; + i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i], "pred-modes")]; } ymode = -i; #else // Hardcoded tree parsing - ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED : - !VP8GetBit(br, prob[1]) ? B_TM_PRED : - !VP8GetBit(br, prob[2]) ? B_VE_PRED : - !VP8GetBit(br, prob[3]) ? - (!VP8GetBit(br, prob[4]) ? B_HE_PRED : - (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) : - (!VP8GetBit(br, prob[6]) ? B_LD_PRED : - (!VP8GetBit(br, prob[7]) ? B_VL_PRED : - (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED))); -#endif // USE_GENERIC_TREE + ymode = !VP8GetBit(br, prob[0], "pred-modes") ? B_DC_PRED : + !VP8GetBit(br, prob[1], "pred-modes") ? B_TM_PRED : + !VP8GetBit(br, prob[2], "pred-modes") ? B_VE_PRED : + !VP8GetBit(br, prob[3], "pred-modes") ? + (!VP8GetBit(br, prob[4], "pred-modes") ? B_HE_PRED : + (!VP8GetBit(br, prob[5], "pred-modes") ? B_RD_PRED + : B_VR_PRED)) : + (!VP8GetBit(br, prob[6], "pred-modes") ? B_LD_PRED : + (!VP8GetBit(br, prob[7], "pred-modes") ? B_VL_PRED : + (!VP8GetBit(br, prob[8], "pred-modes") ? B_HD_PRED + : B_HU_PRED)) + ); +#endif // USE_GENERIC_TREE top[x] = ymode; } memcpy(modes, top, 4 * sizeof(*top)); @@ -344,9 +352,9 @@ static void ParseIntraMode(VP8BitReader* const br, } } // Hardcoded UVMode decision tree - block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED - : !VP8GetBit(br, 114) ? V_PRED - : VP8GetBit(br, 183) ? TM_PRED : H_PRED; + block->uvmode_ = !VP8GetBit(br, 142, "pred-modes-uv") ? DC_PRED + : !VP8GetBit(br, 114, "pred-modes-uv") ? V_PRED + : VP8GetBit(br, 183, "pred-modes-uv") ? TM_PRED : H_PRED; } int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) { @@ -498,7 +506,7 @@ static const uint8_t // Paragraph 9.9 -static const int kBands[16 + 1] = { +static const uint8_t kBands[16 + 1] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 0 // extra entry as sentinel }; @@ -510,8 +518,10 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) { for (b = 0; b < NUM_BANDS; ++b) { for (c = 0; c < NUM_CTX; ++c) { for (p = 0; p < NUM_PROBAS; ++p) { - const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ? - VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p]; + const int v = + VP8GetBit(br, CoeffsUpdateProba[t][b][c][p], "global-header") ? + VP8GetValue(br, 8, "global-header") : + CoeffsProba0[t][b][c][p]; proba->bands_[t][b].probas_[c][p] = v; } } @@ -520,9 +530,8 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) { proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]]; } } - dec->use_skip_proba_ = VP8Get(br); + dec->use_skip_proba_ = VP8Get(br, "global-header"); if (dec->use_skip_proba_) { - dec->skip_p_ = VP8GetValue(br, 8); + dec->skip_p_ = VP8GetValue(br, 8, "global-header"); } } - diff --git a/src/3rdparty/libwebp/src/dec/vp8_dec.c b/src/3rdparty/libwebp/src/dec/vp8_dec.c index fad8d9c..57efb69 100644 --- a/src/3rdparty/libwebp/src/dec/vp8_dec.c +++ b/src/3rdparty/libwebp/src/dec/vp8_dec.c @@ -13,12 +13,12 @@ #include <stdlib.h> -#include "./alphai_dec.h" -#include "./vp8i_dec.h" -#include "./vp8li_dec.h" -#include "./webpi_dec.h" -#include "../utils/bit_reader_inl_utils.h" -#include "../utils/utils.h" +#include "src/dec/alphai_dec.h" +#include "src/dec/vp8i_dec.h" +#include "src/dec/vp8li_dec.h" +#include "src/dec/webpi_dec.h" +#include "src/utils/bit_reader_inl_utils.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ @@ -161,23 +161,26 @@ static int ParseSegmentHeader(VP8BitReader* br, VP8SegmentHeader* hdr, VP8Proba* proba) { assert(br != NULL); assert(hdr != NULL); - hdr->use_segment_ = VP8Get(br); + hdr->use_segment_ = VP8Get(br, "global-header"); if (hdr->use_segment_) { - hdr->update_map_ = VP8Get(br); - if (VP8Get(br)) { // update data + hdr->update_map_ = VP8Get(br, "global-header"); + if (VP8Get(br, "global-header")) { // update data int s; - hdr->absolute_delta_ = VP8Get(br); + hdr->absolute_delta_ = VP8Get(br, "global-header"); for (s = 0; s < NUM_MB_SEGMENTS; ++s) { - hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0; + hdr->quantizer_[s] = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 7, "global-header") : 0; } for (s = 0; s < NUM_MB_SEGMENTS; ++s) { - hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0; + hdr->filter_strength_[s] = VP8Get(br, "global-header") ? + VP8GetSignedValue(br, 6, "global-header") : 0; } } if (hdr->update_map_) { int s; for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) { - proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u; + proba->segments_[s] = VP8Get(br, "global-header") ? + VP8GetValue(br, 8, "global-header") : 255u; } } } else { @@ -205,7 +208,7 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec, size_t last_part; size_t p; - dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2)) - 1; + dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2, "global-header")) - 1; last_part = dec->num_parts_minus_one_; if (size < 3 * last_part) { // we can't even read the sizes with sz[]! That's a failure. @@ -229,21 +232,21 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec, // Paragraph 9.4 static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) { VP8FilterHeader* const hdr = &dec->filter_hdr_; - hdr->simple_ = VP8Get(br); - hdr->level_ = VP8GetValue(br, 6); - hdr->sharpness_ = VP8GetValue(br, 3); - hdr->use_lf_delta_ = VP8Get(br); + hdr->simple_ = VP8Get(br, "global-header"); + hdr->level_ = VP8GetValue(br, 6, "global-header"); + hdr->sharpness_ = VP8GetValue(br, 3, "global-header"); + hdr->use_lf_delta_ = VP8Get(br, "global-header"); if (hdr->use_lf_delta_) { - if (VP8Get(br)) { // update lf-delta? + if (VP8Get(br, "global-header")) { // update lf-delta? int i; for (i = 0; i < NUM_REF_LF_DELTAS; ++i) { - if (VP8Get(br)) { - hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6); + if (VP8Get(br, "global-header")) { + hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header"); } } for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) { - if (VP8Get(br)) { - hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6); + if (VP8Get(br, "global-header")) { + hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header"); } } } @@ -352,8 +355,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { buf_size -= frm_hdr->partition_length_; if (frm_hdr->key_frame_) { - pic_hdr->colorspace_ = VP8Get(br); - pic_hdr->clamp_type_ = VP8Get(br); + pic_hdr->colorspace_ = VP8Get(br, "global-header"); + pic_hdr->clamp_type_ = VP8Get(br, "global-header"); } if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) { return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, @@ -378,7 +381,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { "Not a key frame."); } - VP8Get(br); // ignore the value of update_proba_ + VP8Get(br, "global-header"); // ignore the value of update_proba_ VP8ParseProba(br, dec); @@ -403,28 +406,28 @@ static const uint8_t kZigzag[16] = { // See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) { int v; - if (!VP8GetBit(br, p[3])) { - if (!VP8GetBit(br, p[4])) { + if (!VP8GetBit(br, p[3], "coeffs")) { + if (!VP8GetBit(br, p[4], "coeffs")) { v = 2; } else { - v = 3 + VP8GetBit(br, p[5]); + v = 3 + VP8GetBit(br, p[5], "coeffs"); } } else { - if (!VP8GetBit(br, p[6])) { - if (!VP8GetBit(br, p[7])) { - v = 5 + VP8GetBit(br, 159); + if (!VP8GetBit(br, p[6], "coeffs")) { + if (!VP8GetBit(br, p[7], "coeffs")) { + v = 5 + VP8GetBit(br, 159, "coeffs"); } else { - v = 7 + 2 * VP8GetBit(br, 165); - v += VP8GetBit(br, 145); + v = 7 + 2 * VP8GetBit(br, 165, "coeffs"); + v += VP8GetBit(br, 145, "coeffs"); } } else { const uint8_t* tab; - const int bit1 = VP8GetBit(br, p[8]); - const int bit0 = VP8GetBit(br, p[9 + bit1]); + const int bit1 = VP8GetBit(br, p[8], "coeffs"); + const int bit0 = VP8GetBit(br, p[9 + bit1], "coeffs"); const int cat = 2 * bit1 + bit0; v = 0; for (tab = kCat3456[cat]; *tab; ++tab) { - v += v + VP8GetBit(br, *tab); + v += v + VP8GetBit(br, *tab, "coeffs"); } v += 3 + (8 << cat); } @@ -438,24 +441,24 @@ static int GetCoeffsFast(VP8BitReader* const br, int ctx, const quant_t dq, int n, int16_t* out) { const uint8_t* p = prob[n]->probas_[ctx]; for (; n < 16; ++n) { - if (!VP8GetBit(br, p[0])) { + if (!VP8GetBit(br, p[0], "coeffs")) { return n; // previous coeff was last non-zero coeff } - while (!VP8GetBit(br, p[1])) { // sequence of zero coeffs + while (!VP8GetBit(br, p[1], "coeffs")) { // sequence of zero coeffs p = prob[++n]->probas_[0]; if (n == 16) return 16; } { // non zero coeff const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0]; int v; - if (!VP8GetBit(br, p[2])) { + if (!VP8GetBit(br, p[2], "coeffs")) { v = 1; p = p_ctx[1]; } else { v = GetLargeValue(br, p); p = p_ctx[2]; } - out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0]; + out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0]; } } return 16; @@ -468,30 +471,30 @@ static int GetCoeffsAlt(VP8BitReader* const br, int ctx, const quant_t dq, int n, int16_t* out) { const uint8_t* p = prob[n]->probas_[ctx]; for (; n < 16; ++n) { - if (!VP8GetBitAlt(br, p[0])) { + if (!VP8GetBitAlt(br, p[0], "coeffs")) { return n; // previous coeff was last non-zero coeff } - while (!VP8GetBitAlt(br, p[1])) { // sequence of zero coeffs + while (!VP8GetBitAlt(br, p[1], "coeffs")) { // sequence of zero coeffs p = prob[++n]->probas_[0]; if (n == 16) return 16; } { // non zero coeff const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0]; int v; - if (!VP8GetBitAlt(br, p[2])) { + if (!VP8GetBitAlt(br, p[2], "coeffs")) { v = 1; p = p_ctx[1]; } else { v = GetLargeValue(br, p); p = p_ctx[2]; } - out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0]; + out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0]; } } return 16; } -WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) { +static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) { if (GetCoeffs == NULL) { if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) { GetCoeffs = GetCoeffsAlt; diff --git a/src/3rdparty/libwebp/src/dec/vp8_dec.h b/src/3rdparty/libwebp/src/dec/vp8_dec.h index b9337bb..a05405d 100644 --- a/src/3rdparty/libwebp/src/dec/vp8_dec.h +++ b/src/3rdparty/libwebp/src/dec/vp8_dec.h @@ -11,10 +11,10 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_WEBP_DECODE_VP8_H_ -#define WEBP_WEBP_DECODE_VP8_H_ +#ifndef WEBP_DEC_VP8_DEC_H_ +#define WEBP_DEC_VP8_DEC_H_ -#include "../webp/decode.h" +#include "src/webp/decode.h" #ifdef __cplusplus extern "C" { @@ -33,7 +33,7 @@ extern "C" { // /* customize io's functions (setup()/put()/teardown()) if needed. */ // // VP8Decoder* dec = VP8New(); -// bool ok = VP8Decode(dec); +// int ok = VP8Decode(dec, &io); // if (!ok) printf("Error: %s\n", VP8StatusMessage(dec)); // VP8Delete(dec); // return ok; @@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec); // Miscellaneous VP8/VP8L bitstream probing functions. // Returns true if the next 3 bytes in data contain the VP8 signature. -WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size); +WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size); // Validates the VP8 data-header and retrieves basic header information viz // width and height. Returns 0 in case of formatting error. *width/*height // can be passed NULL. -WEBP_EXTERN(int) VP8GetInfo( +WEBP_EXTERN int VP8GetInfo( const uint8_t* data, size_t data_size, // data available so far size_t chunk_size, // total data size expected in the chunk int* const width, int* const height); // Returns true if the next byte(s) in data is a VP8L signature. -WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size); +WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size); // Validates the VP8L data-header and retrieves basic header information viz // width, height and alpha. Returns 0 in case of formatting error. // width/height/has_alpha can be passed NULL. -WEBP_EXTERN(int) VP8LGetInfo( +WEBP_EXTERN int VP8LGetInfo( const uint8_t* data, size_t data_size, // data available so far int* const width, int* const height, int* const has_alpha); @@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo( } // extern "C" #endif -#endif /* WEBP_WEBP_DECODE_VP8_H_ */ +#endif // WEBP_DEC_VP8_DEC_H_ diff --git a/src/3rdparty/libwebp/src/dec/vp8i_dec.h b/src/3rdparty/libwebp/src/dec/vp8i_dec.h index 555853e..3de8d86 100644 --- a/src/3rdparty/libwebp/src/dec/vp8i_dec.h +++ b/src/3rdparty/libwebp/src/dec/vp8i_dec.h @@ -11,16 +11,16 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_DEC_VP8I_H_ -#define WEBP_DEC_VP8I_H_ +#ifndef WEBP_DEC_VP8I_DEC_H_ +#define WEBP_DEC_VP8I_DEC_H_ #include <string.h> // for memcpy() -#include "./common_dec.h" -#include "./vp8li_dec.h" -#include "../utils/bit_reader_utils.h" -#include "../utils/random_utils.h" -#include "../utils/thread_utils.h" -#include "../dsp/dsp.h" +#include "src/dec/common_dec.h" +#include "src/dec/vp8li_dec.h" +#include "src/utils/bit_reader_utils.h" +#include "src/utils/random_utils.h" +#include "src/utils/thread_utils.h" +#include "src/dsp/dsp.h" #ifdef __cplusplus extern "C" { @@ -30,9 +30,9 @@ extern "C" { // Various defines and enums // version numbers -#define DEC_MAJ_VERSION 0 -#define DEC_MIN_VERSION 6 -#define DEC_REV_VERSION 0 +#define DEC_MAJ_VERSION 1 +#define DEC_MIN_VERSION 0 +#define DEC_REV_VERSION 3 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline). // Constraints are: We need to store one 16x16 block of luma samples (y), @@ -57,7 +57,6 @@ extern "C" { // '|' = left sample, '-' = top sample, '+' = top-left sample // 't' = extra top-right sample for 4x4 modes #define YUV_SIZE (BPS * 17 + BPS * 9) -#define Y_SIZE (BPS * 17) #define Y_OFF (BPS * 1 + 8) #define U_OFF (Y_OFF + BPS * 16 + BPS) #define V_OFF (U_OFF + 16) @@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, } // extern "C" #endif -#endif /* WEBP_DEC_VP8I_H_ */ +#endif // WEBP_DEC_VP8I_DEC_H_ diff --git a/src/3rdparty/libwebp/src/dec/vp8l_dec.c b/src/3rdparty/libwebp/src/dec/vp8l_dec.c index ef359a9..d3e2711 100644 --- a/src/3rdparty/libwebp/src/dec/vp8l_dec.c +++ b/src/3rdparty/libwebp/src/dec/vp8l_dec.c @@ -14,22 +14,22 @@ #include <stdlib.h> -#include "./alphai_dec.h" -#include "./vp8li_dec.h" -#include "../dsp/dsp.h" -#include "../dsp/lossless.h" -#include "../dsp/lossless_common.h" -#include "../dsp/yuv.h" -#include "../utils/endian_inl_utils.h" -#include "../utils/huffman_utils.h" -#include "../utils/utils.h" +#include "src/dec/alphai_dec.h" +#include "src/dec/vp8li_dec.h" +#include "src/dsp/dsp.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" +#include "src/dsp/yuv.h" +#include "src/utils/endian_inl_utils.h" +#include "src/utils/huffman_utils.h" +#include "src/utils/utils.h" #define NUM_ARGB_CACHE_ROWS 16 static const int kCodeLengthLiterals = 16; static const int kCodeLengthRepeatCode = 16; -static const int kCodeLengthExtraBits[3] = { 2, 3, 7 }; -static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 }; +static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 }; +static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 }; // ----------------------------------------------------------------------------- // Five Huffman codes are used at each meta code: @@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = { // All values computed for 8-bit first level lookup with Mark Adler's tool: // http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c #define FIXED_TABLE_SIZE (630 * 3 + 410) -static const int kTableSize[12] = { +static const uint16_t kTableSize[12] = { FIXED_TABLE_SIZE + 654, FIXED_TABLE_SIZE + 656, FIXED_TABLE_SIZE + 658, @@ -363,11 +363,14 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, uint32_t* huffman_image = NULL; HTreeGroup* htree_groups = NULL; HuffmanCode* huffman_tables = NULL; - HuffmanCode* next = NULL; + HuffmanCode* huffman_table = NULL; int num_htree_groups = 1; + int num_htree_groups_max = 1; int max_alphabet_size = 0; int* code_lengths = NULL; const int table_size = kTableSize[color_cache_bits]; + int* mapping = NULL; + int ok = 0; if (allow_recursion && VP8LReadBits(br, 1)) { // use meta Huffman codes. @@ -384,9 +387,35 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, // The huffman data is stored in red and green bytes. const int group = (huffman_image[i] >> 8) & 0xffff; huffman_image[i] = group; - if (group >= num_htree_groups) { - num_htree_groups = group + 1; + if (group >= num_htree_groups_max) { + num_htree_groups_max = group + 1; + } + } + // Check the validity of num_htree_groups_max. If it seems too big, use a + // smaller value for later. This will prevent big memory allocations to end + // up with a bad bitstream anyway. + // The value of 1000 is totally arbitrary. We know that num_htree_groups_max + // is smaller than (1 << 16) and should be smaller than the number of pixels + // (though the format allows it to be bigger). + if (num_htree_groups_max > 1000 || num_htree_groups_max > xsize * ysize) { + // Create a mapping from the used indices to the minimal set of used + // values [0, num_htree_groups) + mapping = (int*)WebPSafeMalloc(num_htree_groups_max, sizeof(*mapping)); + if (mapping == NULL) { + dec->status_ = VP8_STATUS_OUT_OF_MEMORY; + goto Error; + } + // -1 means a value is unmapped, and therefore unused in the Huffman + // image. + memset(mapping, 0xff, num_htree_groups_max * sizeof(*mapping)); + for (num_htree_groups = 0, i = 0; i < huffman_pixs; ++i) { + // Get the current mapping for the group and remap the Huffman image. + int* const mapped_group = &mapping[huffman_image[i]]; + if (*mapped_group == -1) *mapped_group = num_htree_groups++; + huffman_image[i] = *mapped_group; } + } else { + num_htree_groups = num_htree_groups_max; } } @@ -403,88 +432,106 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize, } } + code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, + sizeof(*code_lengths)); huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size, sizeof(*huffman_tables)); htree_groups = VP8LHtreeGroupsNew(num_htree_groups); - code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, - sizeof(*code_lengths)); if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) { dec->status_ = VP8_STATUS_OUT_OF_MEMORY; goto Error; } - next = huffman_tables; - for (i = 0; i < num_htree_groups; ++i) { - HTreeGroup* const htree_group = &htree_groups[i]; - HuffmanCode** const htrees = htree_group->htrees; - int size; - int total_size = 0; - int is_trivial_literal = 1; - int max_bits = 0; - for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { - int alphabet_size = kAlphabetSize[j]; - htrees[j] = next; - if (j == 0 && color_cache_bits > 0) { - alphabet_size += 1 << color_cache_bits; - } - size = ReadHuffmanCode(alphabet_size, dec, code_lengths, next); - if (size == 0) { - goto Error; - } - if (is_trivial_literal && kLiteralMap[j] == 1) { - is_trivial_literal = (next->bits == 0); + huffman_table = huffman_tables; + for (i = 0; i < num_htree_groups_max; ++i) { + // If the index "i" is unused in the Huffman image, just make sure the + // coefficients are valid but do not store them. + if (mapping != NULL && mapping[i] == -1) { + for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { + int alphabet_size = kAlphabetSize[j]; + if (j == 0 && color_cache_bits > 0) { + alphabet_size += (1 << color_cache_bits); + } + // Passing in NULL so that nothing gets filled. + if (!ReadHuffmanCode(alphabet_size, dec, code_lengths, NULL)) { + goto Error; + } } - total_size += next->bits; - next += size; - if (j <= ALPHA) { - int local_max_bits = code_lengths[0]; - int k; - for (k = 1; k < alphabet_size; ++k) { - if (code_lengths[k] > local_max_bits) { - local_max_bits = code_lengths[k]; + } else { + HTreeGroup* const htree_group = + &htree_groups[(mapping == NULL) ? i : mapping[i]]; + HuffmanCode** const htrees = htree_group->htrees; + int size; + int total_size = 0; + int is_trivial_literal = 1; + int max_bits = 0; + for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) { + int alphabet_size = kAlphabetSize[j]; + htrees[j] = huffman_table; + if (j == 0 && color_cache_bits > 0) { + alphabet_size += (1 << color_cache_bits); + } + size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table); + if (size == 0) { + goto Error; + } + if (is_trivial_literal && kLiteralMap[j] == 1) { + is_trivial_literal = (huffman_table->bits == 0); + } + total_size += huffman_table->bits; + huffman_table += size; + if (j <= ALPHA) { + int local_max_bits = code_lengths[0]; + int k; + for (k = 1; k < alphabet_size; ++k) { + if (code_lengths[k] > local_max_bits) { + local_max_bits = code_lengths[k]; + } } + max_bits += local_max_bits; } - max_bits += local_max_bits; } - } - htree_group->is_trivial_literal = is_trivial_literal; - htree_group->is_trivial_code = 0; - if (is_trivial_literal) { - const int red = htrees[RED][0].value; - const int blue = htrees[BLUE][0].value; - const int alpha = htrees[ALPHA][0].value; - htree_group->literal_arb = - ((uint32_t)alpha << 24) | (red << 16) | blue; - if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) { - htree_group->is_trivial_code = 1; - htree_group->literal_arb |= htrees[GREEN][0].value << 8; + htree_group->is_trivial_literal = is_trivial_literal; + htree_group->is_trivial_code = 0; + if (is_trivial_literal) { + const int red = htrees[RED][0].value; + const int blue = htrees[BLUE][0].value; + const int alpha = htrees[ALPHA][0].value; + htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue; + if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) { + htree_group->is_trivial_code = 1; + htree_group->literal_arb |= htrees[GREEN][0].value << 8; + } } + htree_group->use_packed_table = + !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS); + if (htree_group->use_packed_table) BuildPackedTable(htree_group); } - htree_group->use_packed_table = !htree_group->is_trivial_code && - (max_bits < HUFFMAN_PACKED_BITS); - if (htree_group->use_packed_table) BuildPackedTable(htree_group); } - WebPSafeFree(code_lengths); + ok = 1; - // All OK. Finalize pointers and return. + // All OK. Finalize pointers. hdr->huffman_image_ = huffman_image; hdr->num_htree_groups_ = num_htree_groups; hdr->htree_groups_ = htree_groups; hdr->huffman_tables_ = huffman_tables; - return 1; Error: WebPSafeFree(code_lengths); - WebPSafeFree(huffman_image); - WebPSafeFree(huffman_tables); - VP8LHtreeGroupsFree(htree_groups); - return 0; + WebPSafeFree(mapping); + if (!ok) { + WebPSafeFree(huffman_image); + WebPSafeFree(huffman_tables); + VP8LHtreeGroupsFree(htree_groups); + } + return ok; } //------------------------------------------------------------------------------ // Scaling. +#if !defined(WEBP_REDUCE_SIZE) static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { const int num_channels = 4; const int in_width = io->mb_w; @@ -516,10 +563,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) { out_width, out_height, 0, num_channels, work); return 1; } +#endif // WEBP_REDUCE_SIZE //------------------------------------------------------------------------------ // Export to ARGB +#if !defined(WEBP_REDUCE_SIZE) + // We have special "export" function since we need to convert from BGRA static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace, int rgba_stride, uint8_t* const rgba) { @@ -561,6 +611,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec, return num_lines_out; } +#endif // WEBP_REDUCE_SIZE + // Emit rows without any scaling. static int EmitRows(WEBP_CSP_MODE colorspace, const uint8_t* row_in, int in_stride, @@ -746,9 +798,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) { if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA const WebPRGBABuffer* const buf = &output->u.RGBA; uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride; - const int num_rows_out = io->use_scaling ? + const int num_rows_out = +#if !defined(WEBP_REDUCE_SIZE) + io->use_scaling ? EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h, rgba, buf->stride) : +#endif // WEBP_REDUCE_SIZE EmitRows(output->colorspace, rows_data, in_stride, io->mb_w, io->mb_h, rgba, buf->stride); // Update 'last_out_row_'. @@ -875,7 +930,11 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) { #endif break; case 2: +#if !defined(WORDS_BIGENDIAN) memcpy(&pattern, src, sizeof(uint16_t)); +#else + pattern = ((uint32_t)src[0] << 8) | src[1]; +#endif #if defined(__arm__) || defined(_M_ARM) pattern |= pattern << 16; #elif defined(WEBP_USE_MIPS_DSP_R2) @@ -1012,12 +1071,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data, ok = 0; goto End; } - assert(br->eos_ == VP8LIsEndOfStream(br)); + br->eos_ = VP8LIsEndOfStream(br); } // Process the remaining rows corresponding to last row-block. ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row); End: + br->eos_ = VP8LIsEndOfStream(br); if (!ok || (br->eos_ && pos < end)) { ok = 0; dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED @@ -1090,11 +1150,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, VP8LFillBitWindow(br); if (htree_group->use_packed_table) { code = ReadPackedSymbols(htree_group, br, src); + if (VP8LIsEndOfStream(br)) break; if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne; } else { code = ReadSymbol(htree_group->htrees[GREEN], br); } - if (br->eos_) break; // early out + if (VP8LIsEndOfStream(br)) break; if (code < NUM_LITERAL_CODES) { // Literal if (htree_group->is_trivial_literal) { *src = htree_group->literal_arb | (code << 8); @@ -1104,7 +1165,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, VP8LFillBitWindow(br); blue = ReadSymbol(htree_group->htrees[BLUE], br); alpha = ReadSymbol(htree_group->htrees[ALPHA], br); - if (br->eos_) break; + if (VP8LIsEndOfStream(br)) break; *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue; } AdvanceByOne: @@ -1132,7 +1193,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, VP8LFillBitWindow(br); dist_code = GetCopyDistance(dist_symbol, br); dist = PlaneCodeToDistance(width, dist_code); - if (br->eos_) break; + if (VP8LIsEndOfStream(br)) break; if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) { goto Error; } else { @@ -1169,9 +1230,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data, } else { // Not reached goto Error; } - assert(br->eos_ == VP8LIsEndOfStream(br)); } + br->eos_ = VP8LIsEndOfStream(br); if (dec->incremental_ && br->eos_ && src < src_end) { RestoreState(dec); } else if (!br->eos_) { @@ -1512,7 +1573,6 @@ int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec, if (dec == NULL) return 0; assert(alph_dec != NULL); - alph_dec->vp8l_dec_ = dec; dec->width_ = alph_dec->width_; dec->height_ = alph_dec->height_; @@ -1544,11 +1604,12 @@ int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec, if (!ok) goto Err; + // Only set here, once we are sure it is valid (to avoid thread races). + alph_dec->vp8l_dec_ = dec; return 1; Err: - VP8LDelete(alph_dec->vp8l_dec_); - alph_dec->vp8l_dec_ = NULL; + VP8LDelete(dec); return 0; } @@ -1630,12 +1691,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) { if (!AllocateInternalBuffers32b(dec, io->width)) goto Err; +#if !defined(WEBP_REDUCE_SIZE) if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err; - +#else + if (io->use_scaling) { + dec->status_ = VP8_STATUS_INVALID_PARAM; + goto Err; + } +#endif if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) { // need the alpha-multiply functions for premultiplied output or rescaling WebPInitAlphaProcessing(); } + if (!WebPIsRGBMode(dec->output_->colorspace)) { WebPInitConvertARGBToYUV(); if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing(); diff --git a/src/3rdparty/libwebp/src/dec/vp8li_dec.h b/src/3rdparty/libwebp/src/dec/vp8li_dec.h index 097a9d0..0a4d613 100644 --- a/src/3rdparty/libwebp/src/dec/vp8li_dec.h +++ b/src/3rdparty/libwebp/src/dec/vp8li_dec.h @@ -12,14 +12,14 @@ // Author: Skal (pascal.massimino@gmail.com) // Vikas Arora(vikaas.arora@gmail.com) -#ifndef WEBP_DEC_VP8LI_H_ -#define WEBP_DEC_VP8LI_H_ +#ifndef WEBP_DEC_VP8LI_DEC_H_ +#define WEBP_DEC_VP8LI_DEC_H_ #include <string.h> // for memcpy() -#include "./webpi_dec.h" -#include "../utils/bit_reader_utils.h" -#include "../utils/color_cache_utils.h" -#include "../utils/huffman_utils.h" +#include "src/dec/webpi_dec.h" +#include "src/utils/bit_reader_utils.h" +#include "src/utils/color_cache_utils.h" +#include "src/utils/huffman_utils.h" #ifdef __cplusplus extern "C" { @@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec); } // extern "C" #endif -#endif /* WEBP_DEC_VP8LI_H_ */ +#endif // WEBP_DEC_VP8LI_DEC_H_ diff --git a/src/3rdparty/libwebp/src/dec/webp_dec.c b/src/3rdparty/libwebp/src/dec/webp_dec.c index a8e9c2c..42d0988 100644 --- a/src/3rdparty/libwebp/src/dec/webp_dec.c +++ b/src/3rdparty/libwebp/src/dec/webp_dec.c @@ -13,11 +13,11 @@ #include <stdlib.h> -#include "./vp8i_dec.h" -#include "./vp8li_dec.h" -#include "./webpi_dec.h" -#include "../utils/utils.h" -#include "../webp/mux_types.h" // ALPHA_FLAG +#include "src/dec/vp8i_dec.h" +#include "src/dec/vp8li_dec.h" +#include "src/dec/webpi_dec.h" +#include "src/utils/utils.h" +#include "src/webp/mux_types.h" // ALPHA_FLAG //------------------------------------------------------------------------------ // RIFF layout is: @@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) { NULL, NULL, NULL, &has_animation, NULL, headers); if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) { - // TODO(jzern): full support of animation frames will require API additions. + // The WebPDemux API + libwebp can be used to decode individual + // uncomposited frames or the WebPAnimDecoder can be used to fully + // reconstruct them (see webp/demux.h). if (has_animation) { status = VP8_STATUS_UNSUPPORTED_FEATURE; } diff --git a/src/3rdparty/libwebp/src/dec/webpi_dec.h b/src/3rdparty/libwebp/src/dec/webpi_dec.h index 696abc1..24baff5 100644 --- a/src/3rdparty/libwebp/src/dec/webpi_dec.h +++ b/src/3rdparty/libwebp/src/dec/webpi_dec.h @@ -11,15 +11,15 @@ // // Author: somnath@google.com (Somnath Banerjee) -#ifndef WEBP_DEC_WEBPI_H_ -#define WEBP_DEC_WEBPI_H_ +#ifndef WEBP_DEC_WEBPI_DEC_H_ +#define WEBP_DEC_WEBPI_DEC_H_ #ifdef __cplusplus extern "C" { #endif -#include "../utils/rescaler_utils.h" -#include "./vp8_dec.h" +#include "src/utils/rescaler_utils.h" +#include "src/dec/vp8_dec.h" //------------------------------------------------------------------------------ // WebPDecParams: Decoding output parameters. Transient internal object. @@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output, } // extern "C" #endif -#endif /* WEBP_DEC_WEBPI_H_ */ +#endif // WEBP_DEC_WEBPI_DEC_H_ diff --git a/src/3rdparty/libwebp/src/demux/anim_decode.c b/src/3rdparty/libwebp/src/demux/anim_decode.c index f1cf176..05dd707 100644 --- a/src/3rdparty/libwebp/src/demux/anim_decode.c +++ b/src/3rdparty/libwebp/src/demux/anim_decode.c @@ -11,15 +11,15 @@ // #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif #include <assert.h> #include <string.h> -#include "../utils/utils.h" -#include "../webp/decode.h" -#include "../webp/demux.h" +#include "src/utils/utils.h" +#include "src/webp/decode.h" +#include "src/webp/demux.h" #define NUM_CHANNELS 4 diff --git a/src/3rdparty/libwebp/src/demux/demux.c b/src/3rdparty/libwebp/src/demux/demux.c index 100eab8..ab6433e 100644 --- a/src/3rdparty/libwebp/src/demux/demux.c +++ b/src/3rdparty/libwebp/src/demux/demux.c @@ -11,21 +11,21 @@ // #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif #include <assert.h> #include <stdlib.h> #include <string.h> -#include "../utils/utils.h" -#include "../webp/decode.h" // WebPGetFeatures -#include "../webp/demux.h" -#include "../webp/format_constants.h" +#include "src/utils/utils.h" +#include "src/webp/decode.h" // WebPGetFeatures +#include "src/webp/demux.h" +#include "src/webp/format_constants.h" -#define DMUX_MAJ_VERSION 0 -#define DMUX_MIN_VERSION 3 -#define DMUX_REV_VERSION 2 +#define DMUX_MAJ_VERSION 1 +#define DMUX_MIN_VERSION 0 +#define DMUX_REV_VERSION 3 typedef struct { size_t start_; // start location of the data @@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size, frame->complete_ = complete; } -// Store image bearing chunks to 'frame'. +// Store image bearing chunks to 'frame'. 'min_size' is an optional size +// requirement, it may be zero. static ParseStatus StoreFrame(int frame_num, uint32_t min_size, MemBuffer* const mem, Frame* const frame) { int alpha_chunks = 0; int image_chunks = 0; - int done = (MemDataSize(mem) < min_size); + int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE || + MemDataSize(mem) < min_size); ParseStatus status = PARSE_OK; if (done) return PARSE_NEED_MORE_DATA; @@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) { frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame)); if (frame == NULL) return PARSE_ERROR; - // For the single image case we allow parsing of a partial frame, but we need - // at least CHUNK_HEADER_SIZE for parsing. - status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame); + // For the single image case we allow parsing of a partial frame, so no + // minimum size is imposed here. + status = StoreFrame(1, 0, &dmux->mem_, frame); if (status != PARSE_ERROR) { const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG); // Clear any alpha when the alpha flag is missing. diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c index 4b60e09..819d139 100644 --- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c +++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c @@ -12,10 +12,13 @@ // Author: Skal (pascal.massimino@gmail.com) #include <assert.h> -#include "./dsp.h" +#include "src/dsp/dsp.h" // Tables can be faster on some platform but incur some extra binary size (~2k). -// #define USE_TABLES_FOR_ALPHA_MULT +#if !defined(USE_TABLES_FOR_ALPHA_MULT) +#define USE_TABLES_FOR_ALPHA_MULT 0 // ALTERNATE_CODE +#endif + // ----------------------------------------------------------------------------- @@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) { return v; } -#ifdef USE_TABLES_FOR_ALPHA_MULT +#if (USE_TABLES_FOR_ALPHA_MULT == 1) static const uint32_t kMultTables[2][256] = { { // (255u << MFIX) / alpha @@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) { return inverse ? (255u << MFIX) / a : a * KINV_255; } -#endif // USE_TABLES_FOR_ALPHA_MULT +#endif // USE_TABLES_FOR_ALPHA_MULT -void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) { +void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) { int x; for (x = 0; x < width; ++x) { const uint32_t argb = ptr[x]; @@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) { } } -void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha, - int width, int inverse) { +void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha, + int width, int inverse) { int x; for (x = 0; x < width; ++x) { const uint32_t a = alpha[x]; @@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride, #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24) #endif -static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first, - int w, int h, int stride) { +#if !WEBP_NEON_OMIT_C_CODE +static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first, + int w, int h, int stride) { while (h-- > 0) { uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); @@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first, rgba += stride; } } +#endif // !WEBP_NEON_OMIT_C_CODE #undef MULTIPLIER #undef PREMULTIPLY @@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) { return (x * m) >> 16; } -static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444, - int w, int h, int stride, - int rg_byte_pos /* 0 or 1 */) { +static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444, + int w, int h, int stride, + int rg_byte_pos /* 0 or 1 */) { while (h-- > 0) { int i; for (i = 0; i < w; ++i) { @@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444, } #undef MULTIPLIER -static void ApplyAlphaMultiply_16b(uint8_t* rgba4444, - int w, int h, int stride) { -#ifdef WEBP_SWAP_16BIT_CSP - ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1); +static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444, + int w, int h, int stride) { +#if (WEBP_SWAP_16BIT_CSP == 1) + ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1); #else - ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0); + ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0); #endif } +#if !WEBP_NEON_OMIT_C_CODE static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride, int width, int height, uint8_t* dst, int dst_stride) { @@ -338,6 +344,46 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) { int i; for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8; } +#endif // !WEBP_NEON_OMIT_C_CODE + +//------------------------------------------------------------------------------ + +static int HasAlpha8b_C(const uint8_t* src, int length) { + while (length-- > 0) if (*src++ != 0xff) return 1; + return 0; +} + +static int HasAlpha32b_C(const uint8_t* src, int length) { + int x; + for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1; + return 0; +} + +//------------------------------------------------------------------------------ +// Simple channel manipulations. + +static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) { + return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b); +} + +#ifdef WORDS_BIGENDIAN +static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g, + const uint8_t* b, int len, uint32_t* out) { + int i; + for (i = 0; i < len; ++i) { + out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]); + } +} +#endif + +static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b, + int len, int step, uint32_t* out) { + int i, offset = 0; + for (i = 0; i < len; ++i) { + out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]); + offset += step; + } +} void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int); void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int); @@ -345,6 +391,15 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int); void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int); int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size); +#ifdef WORDS_BIGENDIAN +void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g, + const uint8_t* b, int, uint32_t*); +#endif +void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, + int len, int step, uint32_t* out); + +int (*WebPHasAlpha8b)(const uint8_t* src, int length); +int (*WebPHasAlpha32b)(const uint8_t* src, int length); //------------------------------------------------------------------------------ // Init function @@ -354,21 +409,25 @@ extern void WebPInitAlphaProcessingSSE2(void); extern void WebPInitAlphaProcessingSSE41(void); extern void WebPInitAlphaProcessingNEON(void); -static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used = - (VP8CPUInfo)&alpha_processing_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) { - if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return; - - WebPMultARGBRow = WebPMultARGBRowC; - WebPMultRow = WebPMultRowC; - WebPApplyAlphaMultiply = ApplyAlphaMultiply; - WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b; +WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) { + WebPMultARGBRow = WebPMultARGBRow_C; + WebPMultRow = WebPMultRow_C; + WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C; +#ifdef WORDS_BIGENDIAN + WebPPackARGB = PackARGB_C; +#endif + WebPPackRGB = PackRGB_C; +#if !WEBP_NEON_OMIT_C_CODE + WebPApplyAlphaMultiply = ApplyAlphaMultiply_C; WebPDispatchAlpha = DispatchAlpha_C; WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C; WebPExtractAlpha = ExtractAlpha_C; WebPExtractGreen = ExtractGreen_C; +#endif + + WebPHasAlpha8b = HasAlpha8b_C; + WebPHasAlpha32b = HasAlpha32b_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { @@ -382,16 +441,32 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) { #endif } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - WebPInitAlphaProcessingNEON(); - } -#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { WebPInitAlphaProcessingMIPSdspR2(); } #endif } - alpha_processing_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPInitAlphaProcessingNEON(); + } +#endif + + assert(WebPMultARGBRow != NULL); + assert(WebPMultRow != NULL); + assert(WebPApplyAlphaMultiply != NULL); + assert(WebPApplyAlphaMultiply4444 != NULL); + assert(WebPDispatchAlpha != NULL); + assert(WebPDispatchAlphaToGreen != NULL); + assert(WebPExtractAlpha != NULL); + assert(WebPExtractGreen != NULL); +#ifdef WORDS_BIGENDIAN + assert(WebPPackARGB != NULL); +#endif + assert(WebPPackRGB != NULL); + assert(WebPHasAlpha8b != NULL); + assert(WebPHasAlpha32b != NULL); } diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c index c631d78..0090e87 100644 --- a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c @@ -12,13 +12,13 @@ // Author(s): Branimir Vasic (branimir.vasic@imgtec.com) // Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint8_t* dst, int dst_stride) { +static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint8_t* dst, int dst_stride) { uint32_t alpha_mask = 0xffffffff; int i, j, temp0; @@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_mask != 0xff); } -static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { +static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width, + int inverse) { int x; const uint32_t c_00ffffff = 0x00ffffffu; const uint32_t c_ff000000 = 0xff000000u; @@ -124,14 +125,100 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { } } +#ifdef WORDS_BIGENDIAN +static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r, + const uint8_t* g, const uint8_t* b, int len, + uint32_t* out) { + int temp0, temp1, temp2, temp3, offset; + const int rest = len & 1; + const uint32_t* const loop_end = out + len - rest; + const int step = 4; + __asm__ volatile ( + "xor %[offset], %[offset], %[offset] \n\t" + "beq %[loop_end], %[out], 0f \n\t" + "2: \n\t" + "lbux %[temp0], %[offset](%[a]) \n\t" + "lbux %[temp1], %[offset](%[r]) \n\t" + "lbux %[temp2], %[offset](%[g]) \n\t" + "lbux %[temp3], %[offset](%[b]) \n\t" + "ins %[temp1], %[temp0], 16, 16 \n\t" + "ins %[temp3], %[temp2], 16, 16 \n\t" + "addiu %[out], %[out], 4 \n\t" + "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t" + "sw %[temp0], -4(%[out]) \n\t" + "addu %[offset], %[offset], %[step] \n\t" + "bne %[loop_end], %[out], 2b \n\t" + "0: \n\t" + "beq %[rest], $zero, 1f \n\t" + "lbux %[temp0], %[offset](%[a]) \n\t" + "lbux %[temp1], %[offset](%[r]) \n\t" + "lbux %[temp2], %[offset](%[g]) \n\t" + "lbux %[temp3], %[offset](%[b]) \n\t" + "ins %[temp1], %[temp0], 16, 16 \n\t" + "ins %[temp3], %[temp2], 16, 16 \n\t" + "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t" + "sw %[temp0], 0(%[out]) \n\t" + "1: \n\t" + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), + [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out) + : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step), + [loop_end]"r"(loop_end), [rest]"r"(rest) + : "memory" + ); +} +#endif // WORDS_BIGENDIAN + +static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g, + const uint8_t* b, int len, int step, + uint32_t* out) { + int temp0, temp1, temp2, offset; + const int rest = len & 1; + const int a = 0xff; + const uint32_t* const loop_end = out + len - rest; + __asm__ volatile ( + "xor %[offset], %[offset], %[offset] \n\t" + "beq %[loop_end], %[out], 0f \n\t" + "2: \n\t" + "lbux %[temp0], %[offset](%[r]) \n\t" + "lbux %[temp1], %[offset](%[g]) \n\t" + "lbux %[temp2], %[offset](%[b]) \n\t" + "ins %[temp0], %[a], 16, 16 \n\t" + "ins %[temp2], %[temp1], 16, 16 \n\t" + "addiu %[out], %[out], 4 \n\t" + "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t" + "sw %[temp0], -4(%[out]) \n\t" + "addu %[offset], %[offset], %[step] \n\t" + "bne %[loop_end], %[out], 2b \n\t" + "0: \n\t" + "beq %[rest], $zero, 1f \n\t" + "lbux %[temp0], %[offset](%[r]) \n\t" + "lbux %[temp1], %[offset](%[g]) \n\t" + "lbux %[temp2], %[offset](%[b]) \n\t" + "ins %[temp0], %[a], 16, 16 \n\t" + "ins %[temp2], %[temp1], 16, 16 \n\t" + "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t" + "sw %[temp0], 0(%[out]) \n\t" + "1: \n\t" + : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), + [offset]"=&r"(offset), [out]"+&r"(out) + : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step), + [loop_end]"r"(loop_end), [rest]"r"(rest) + : "memory" + ); +} + //------------------------------------------------------------------------------ // Entry point extern void WebPInitAlphaProcessingMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) { - WebPDispatchAlpha = DispatchAlpha; - WebPMultARGBRow = MultARGBRow; + WebPDispatchAlpha = DispatchAlpha_MIPSdspR2; + WebPMultARGBRow = MultARGBRow_MIPSdspR2; +#ifdef WORDS_BIGENDIAN + WebPPackARGB = PackARGB_MIPSdspR2; +#endif + WebPPackRGB = PackRGB_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c index 606a401..9d55421 100644 --- a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c +++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c @@ -11,11 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) -#include "./neon.h" +#include "src/dsp/neon.h" //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c index 83dc559..2871c56 100644 --- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c @@ -11,16 +11,16 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) #include <emmintrin.h> //------------------------------------------------------------------------------ -static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint8_t* dst, int dst_stride) { +static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint8_t* dst, int dst_stride) { // alpha_and stores an 'and' operation of all the alpha[] values. The final // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; @@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride, return (alpha_and != 0xff); } -static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint32_t* dst, int dst_stride) { +static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride, + int width, int height, + uint32_t* dst, int dst_stride) { int i, j; const __m128i zero = _mm_setzero_si128(); const int limit = width & ~15; @@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride, } } -static int ExtractAlpha(const uint8_t* argb, int argb_stride, - int width, int height, - uint8_t* alpha, int alpha_stride) { +static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { // alpha_and stores an 'and' operation of all the alpha[] values. The final // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; @@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, #undef MULTIPLIER #undef PREMULTIPLY +//------------------------------------------------------------------------------ +// Alpha detection + +static int HasAlpha8b_SSE2(const uint8_t* src, int length) { + const __m128i all_0xff = _mm_set1_epi8((char)0xff); + int i = 0; + for (; i + 16 <= length; i += 16) { + const __m128i v = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i bits = _mm_cmpeq_epi8(v, all_0xff); + const int mask = _mm_movemask_epi8(bits); + if (mask != 0xffff) return 1; + } + for (; i < length; ++i) if (src[i] != 0xff) return 1; + return 0; +} + +static int HasAlpha32b_SSE2(const uint8_t* src, int length) { + const __m128i alpha_mask = _mm_set1_epi32(0xff); + const __m128i all_0xff = _mm_set1_epi8((char)0xff); + int i = 0; + // We don't know if we can access the last 3 bytes after the last alpha + // value 'src[4 * length - 4]' (because we don't know if alpha is the first + // or the last byte of the quadruplet). Hence the '-3' protection below. + length = length * 4 - 3; // size in bytes + for (; i + 64 <= length; i += 64) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16)); + const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32)); + const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48)); + const __m128i b0 = _mm_and_si128(a0, alpha_mask); + const __m128i b1 = _mm_and_si128(a1, alpha_mask); + const __m128i b2 = _mm_and_si128(a2, alpha_mask); + const __m128i b3 = _mm_and_si128(a3, alpha_mask); + const __m128i c0 = _mm_packs_epi32(b0, b1); + const __m128i c1 = _mm_packs_epi32(b2, b3); + const __m128i d = _mm_packus_epi16(c0, c1); + const __m128i bits = _mm_cmpeq_epi8(d, all_0xff); + const int mask = _mm_movemask_epi8(bits); + if (mask != 0xffff) return 1; + } + for (; i + 32 <= length; i += 32) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16)); + const __m128i b0 = _mm_and_si128(a0, alpha_mask); + const __m128i b1 = _mm_and_si128(a1, alpha_mask); + const __m128i c = _mm_packs_epi32(b0, b1); + const __m128i d = _mm_packus_epi16(c, c); + const __m128i bits = _mm_cmpeq_epi8(d, all_0xff); + const int mask = _mm_movemask_epi8(bits); + if (mask != 0xffff) return 1; + } + for (; i <= length; i += 4) if (src[i] != 0xff) return 1; + return 0; +} + // ----------------------------------------------------------------------------- // Apply alpha value to rows @@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) { } } width -= x; - if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse); + if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse); } static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha, @@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha, } } width -= x; - if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse); + if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse); } //------------------------------------------------------------------------------ @@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { WebPMultARGBRow = MultARGBRow_SSE2; WebPMultRow = MultRow_SSE2; WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2; - WebPDispatchAlpha = DispatchAlpha; - WebPDispatchAlphaToGreen = DispatchAlphaToGreen; - WebPExtractAlpha = ExtractAlpha; + WebPDispatchAlpha = DispatchAlpha_SSE2; + WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2; + WebPExtractAlpha = ExtractAlpha_SSE2; + + WebPHasAlpha8b = HasAlpha8b_SSE2; + WebPHasAlpha32b = HasAlpha32b_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c index 986fde9..56040f9 100644 --- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE41) @@ -19,9 +19,9 @@ //------------------------------------------------------------------------------ -static int ExtractAlpha(const uint8_t* argb, int argb_stride, - int width, int height, - uint8_t* alpha, int alpha_stride) { +static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride, + int width, int height, + uint8_t* alpha, int alpha_stride) { // alpha_and stores an 'and' operation of all the alpha[] values. The final // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; @@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride, extern void WebPInitAlphaProcessingSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) { - WebPExtractAlpha = ExtractAlpha; + WebPExtractAlpha = ExtractAlpha_SSE41; } #else // !WEBP_USE_SSE41 diff --git a/src/3rdparty/libwebp/src/dsp/argb.c b/src/3rdparty/libwebp/src/dsp/argb.c deleted file mode 100644 index cc1f9a9..0000000 --- a/src/3rdparty/libwebp/src/dsp/argb.c +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2014 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// ARGB making functions. -// -// Author: Djordje Pesut (djordje.pesut@imgtec.com) - -#include "./dsp.h" - -static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) { - return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b); -} - -static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g, - const uint8_t* b, int len, uint32_t* out) { - int i; - for (i = 0; i < len; ++i) { - out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]); - } -} - -static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out) { - int i, offset = 0; - for (i = 0; i < len; ++i) { - out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]); - offset += step; - } -} - -void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*, - const uint8_t*, int, uint32_t*); -void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*, - int, int, uint32_t*); - -extern void VP8EncDspARGBInitMIPSdspR2(void); -extern void VP8EncDspARGBInitSSE2(void); - -static volatile VP8CPUInfo argb_last_cpuinfo_used = - (VP8CPUInfo)&argb_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) { - if (argb_last_cpuinfo_used == VP8GetCPUInfo) return; - - VP8PackARGB = PackARGB; - VP8PackRGB = PackRGB; - - // If defined, use CPUInfo() to overwrite some pointers with faster versions. - if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) - if (VP8GetCPUInfo(kSSE2)) { - VP8EncDspARGBInitSSE2(); - } -#endif -#if defined(WEBP_USE_MIPS_DSP_R2) - if (VP8GetCPUInfo(kMIPSdspR2)) { - VP8EncDspARGBInitMIPSdspR2(); - } -#endif - } - argb_last_cpuinfo_used = VP8GetCPUInfo; -} diff --git a/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c deleted file mode 100644 index af65acb..0000000 --- a/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2014 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// ARGB making functions (mips version). -// -// Author: Djordje Pesut (djordje.pesut@imgtec.com) - -#include "./dsp.h" - -#if defined(WEBP_USE_MIPS_DSP_R2) - -static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g, - const uint8_t* b, int len, uint32_t* out) { - int temp0, temp1, temp2, temp3, offset; - const int rest = len & 1; - const uint32_t* const loop_end = out + len - rest; - const int step = 4; - __asm__ volatile ( - "xor %[offset], %[offset], %[offset] \n\t" - "beq %[loop_end], %[out], 0f \n\t" - "2: \n\t" - "lbux %[temp0], %[offset](%[a]) \n\t" - "lbux %[temp1], %[offset](%[r]) \n\t" - "lbux %[temp2], %[offset](%[g]) \n\t" - "lbux %[temp3], %[offset](%[b]) \n\t" - "ins %[temp1], %[temp0], 16, 16 \n\t" - "ins %[temp3], %[temp2], 16, 16 \n\t" - "addiu %[out], %[out], 4 \n\t" - "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t" - "sw %[temp0], -4(%[out]) \n\t" - "addu %[offset], %[offset], %[step] \n\t" - "bne %[loop_end], %[out], 2b \n\t" - "0: \n\t" - "beq %[rest], $zero, 1f \n\t" - "lbux %[temp0], %[offset](%[a]) \n\t" - "lbux %[temp1], %[offset](%[r]) \n\t" - "lbux %[temp2], %[offset](%[g]) \n\t" - "lbux %[temp3], %[offset](%[b]) \n\t" - "ins %[temp1], %[temp0], 16, 16 \n\t" - "ins %[temp3], %[temp2], 16, 16 \n\t" - "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t" - "sw %[temp0], 0(%[out]) \n\t" - "1: \n\t" - : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), - [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out) - : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step), - [loop_end]"r"(loop_end), [rest]"r"(rest) - : "memory" - ); -} - -static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out) { - int temp0, temp1, temp2, offset; - const int rest = len & 1; - const int a = 0xff; - const uint32_t* const loop_end = out + len - rest; - __asm__ volatile ( - "xor %[offset], %[offset], %[offset] \n\t" - "beq %[loop_end], %[out], 0f \n\t" - "2: \n\t" - "lbux %[temp0], %[offset](%[r]) \n\t" - "lbux %[temp1], %[offset](%[g]) \n\t" - "lbux %[temp2], %[offset](%[b]) \n\t" - "ins %[temp0], %[a], 16, 16 \n\t" - "ins %[temp2], %[temp1], 16, 16 \n\t" - "addiu %[out], %[out], 4 \n\t" - "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t" - "sw %[temp0], -4(%[out]) \n\t" - "addu %[offset], %[offset], %[step] \n\t" - "bne %[loop_end], %[out], 2b \n\t" - "0: \n\t" - "beq %[rest], $zero, 1f \n\t" - "lbux %[temp0], %[offset](%[r]) \n\t" - "lbux %[temp1], %[offset](%[g]) \n\t" - "lbux %[temp2], %[offset](%[b]) \n\t" - "ins %[temp0], %[a], 16, 16 \n\t" - "ins %[temp2], %[temp1], 16, 16 \n\t" - "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t" - "sw %[temp0], 0(%[out]) \n\t" - "1: \n\t" - : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), - [offset]"=&r"(offset), [out]"+&r"(out) - : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step), - [loop_end]"r"(loop_end), [rest]"r"(rest) - : "memory" - ); -} - -//------------------------------------------------------------------------------ -// Entry point - -extern void VP8EncDspARGBInitMIPSdspR2(void); - -WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) { - VP8PackARGB = PackARGB; - VP8PackRGB = PackRGB; -} - -#else // !WEBP_USE_MIPS_DSP_R2 - -WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2) - -#endif // WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/argb_sse2.c b/src/3rdparty/libwebp/src/dsp/argb_sse2.c deleted file mode 100644 index afcb195..0000000 --- a/src/3rdparty/libwebp/src/dsp/argb_sse2.c +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2014 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// ARGB making functions (SSE2 version). -// -// Author: Skal (pascal.massimino@gmail.com) - -#include "./dsp.h" - -#if defined(WEBP_USE_SSE2) - -#include <assert.h> -#include <emmintrin.h> -#include <string.h> - -static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) { - return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b); -} - -static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g, - const uint8_t* b, int len, uint32_t* out) { - if (g == r + 1) { // RGBA input order. Need to swap R and B. - int i = 0; - const int len_max = len & ~3; // max length processed in main loop - const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu); - assert(b == r + 2); - assert(a == r + 3); - for (; i < len_max; i += 4) { - const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i)); - const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0 - const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A - const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1)); - const __m128i F = _mm_or_si128(E, C); - _mm_storeu_si128((__m128i*)(out + i), F); - } - for (; i < len; ++i) { - out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]); - } - } else { - assert(g == b + 1); - assert(r == b + 2); - assert(a == b + 3); - memcpy(out, b, len * 4); - } -} - -//------------------------------------------------------------------------------ -// Entry point - -extern void VP8EncDspARGBInitSSE2(void); - -WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) { - VP8PackARGB = PackARGB; -} - -#else // !WEBP_USE_SSE2 - -WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2) - -#endif // WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/common_sse2.h b/src/3rdparty/libwebp/src/dsp/common_sse2.h index 995d7cf..e9f1ebf 100644 --- a/src/3rdparty/libwebp/src/dsp/common_sse2.h +++ b/src/3rdparty/libwebp/src/dsp/common_sse2.h @@ -128,9 +128,9 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b( // Pack the planar buffers // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... -static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1, - __m128i* const in2, __m128i* const in3, - __m128i* const in4, __m128i* const in5) { +static WEBP_INLINE void VP8PlanarTo24b_SSE2( + __m128i* const in0, __m128i* const in1, __m128i* const in2, + __m128i* const in3, __m128i* const in4, __m128i* const in5) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. // To pack, we will keep taking one every two 8b integer and move it @@ -159,10 +159,10 @@ static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1, // Convert four packed four-channel buffers like argbargbargbargb... into the // split channels aaaaa ... rrrr ... gggg .... bbbbb ...... -static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0, - __m128i* const in1, - __m128i* const in2, - __m128i* const in3) { +static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0, + __m128i* const in1, + __m128i* const in2, + __m128i* const in3) { // Column-wise transpose. const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1); const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1); diff --git a/src/3rdparty/libwebp/src/dsp/common_sse41.h b/src/3rdparty/libwebp/src/dsp/common_sse41.h new file mode 100644 index 0000000..2f173c0 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/common_sse41.h @@ -0,0 +1,132 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE4 code common to several files. +// +// Author: Vincent Rabaud (vrabaud@google.com) + +#ifndef WEBP_DSP_COMMON_SSE41_H_ +#define WEBP_DSP_COMMON_SSE41_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(WEBP_USE_SSE41) +#include <smmintrin.h> + +//------------------------------------------------------------------------------ +// Channel mixing. +// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ... +#define WEBP_SSE41_SHUFF(OUT, IN0, IN1) \ + OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \ + OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \ + OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \ + OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \ + OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \ + OUT##5 = _mm_shuffle_epi8(*IN1, shuff2); + +// Pack the planar buffers +// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... +// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... +static WEBP_INLINE void VP8PlanarTo24b_SSE41( + __m128i* const in0, __m128i* const in1, __m128i* const in2, + __m128i* const in3, __m128i* const in4, __m128i* const in5) { + __m128i R0, R1, R2, R3, R4, R5; + __m128i G0, G1, G2, G3, G4, G5; + __m128i B0, B1, B2, B3, B4, B5; + + // Process R. + { + const __m128i shuff0 = _mm_set_epi8( + 5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0); + const __m128i shuff1 = _mm_set_epi8( + -1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1); + const __m128i shuff2 = _mm_set_epi8( + -1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1); + WEBP_SSE41_SHUFF(R, in0, in1) + } + + // Process G. + { + // Same as before, just shifted to the left by one and including the right + // padding. + const __m128i shuff0 = _mm_set_epi8( + -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1); + const __m128i shuff1 = _mm_set_epi8( + 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5); + const __m128i shuff2 = _mm_set_epi8( + -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1); + WEBP_SSE41_SHUFF(G, in2, in3) + } + + // Process B. + { + const __m128i shuff0 = _mm_set_epi8( + -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1); + const __m128i shuff1 = _mm_set_epi8( + -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1); + const __m128i shuff2 = _mm_set_epi8( + 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10); + WEBP_SSE41_SHUFF(B, in4, in5) + } + + // OR the different channels. + { + const __m128i RG0 = _mm_or_si128(R0, G0); + const __m128i RG1 = _mm_or_si128(R1, G1); + const __m128i RG2 = _mm_or_si128(R2, G2); + const __m128i RG3 = _mm_or_si128(R3, G3); + const __m128i RG4 = _mm_or_si128(R4, G4); + const __m128i RG5 = _mm_or_si128(R5, G5); + *in0 = _mm_or_si128(RG0, B0); + *in1 = _mm_or_si128(RG1, B1); + *in2 = _mm_or_si128(RG2, B2); + *in3 = _mm_or_si128(RG3, B3); + *in4 = _mm_or_si128(RG4, B4); + *in5 = _mm_or_si128(RG5, B5); + } +} + +#undef WEBP_SSE41_SHUFF + +// Convert four packed four-channel buffers like argbargbargbargb... into the +// split channels aaaaa ... rrrr ... gggg .... bbbbb ...... +static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0, + __m128i* const in1, + __m128i* const in2, + __m128i* const in3) { + // aaaarrrrggggbbbb + const __m128i shuff0 = + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0); + const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0); + const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0); + const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0); + // A0A1R0R1 + // G0G1B0B1 + // A2A3R2R3 + // G0G1B0B1 + const __m128i B0 = _mm_unpacklo_epi32(A0, A1); + const __m128i B1 = _mm_unpackhi_epi32(A0, A1); + const __m128i B2 = _mm_unpacklo_epi32(A2, A3); + const __m128i B3 = _mm_unpackhi_epi32(A2, A3); + *in3 = _mm_unpacklo_epi64(B0, B2); + *in2 = _mm_unpackhi_epi64(B0, B2); + *in1 = _mm_unpacklo_epi64(B1, B3); + *in0 = _mm_unpackhi_epi64(B1, B3); +} + +#endif // WEBP_USE_SSE41 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // WEBP_DSP_COMMON_SSE41_H_ diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c index 58ddea7..cc681cd 100644 --- a/src/3rdparty/libwebp/src/dsp/cost.c +++ b/src/3rdparty/libwebp/src/dsp/cost.c @@ -9,8 +9,8 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" -#include "../enc/cost_enc.h" +#include "src/dsp/dsp.h" +#include "src/enc/cost_enc.h" //------------------------------------------------------------------------------ // Boolean-cost cost table @@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = { //------------------------------------------------------------------------------ // Mode costs -static int GetResidualCost(int ctx0, const VP8Residual* const res) { +static int GetResidualCost_C(int ctx0, const VP8Residual* const res) { int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; @@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) { return cost; } -static void SetResidualCoeffs(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_C(const int16_t* const coeffs, + VP8Residual* const res) { int n; res->last = -1; assert(res->first == 0 || coeffs[0] == 0); @@ -377,15 +377,11 @@ VP8SetResidualCoeffsFunc VP8SetResidualCoeffs; extern void VP8EncDspCostInitMIPS32(void); extern void VP8EncDspCostInitMIPSdspR2(void); extern void VP8EncDspCostInitSSE2(void); +extern void VP8EncDspCostInitNEON(void); -static volatile VP8CPUInfo cost_last_cpuinfo_used = - (VP8CPUInfo)&cost_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) { - if (cost_last_cpuinfo_used == VP8GetCPUInfo) return; - - VP8GetResidualCost = GetResidualCost; - VP8SetResidualCoeffs = SetResidualCoeffs; +WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) { + VP8GetResidualCost = GetResidualCost_C; + VP8SetResidualCoeffs = SetResidualCoeffs_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { @@ -404,9 +400,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) { VP8EncDspCostInitSSE2(); } #endif +#if defined(WEBP_USE_NEON) + if (VP8GetCPUInfo(kNEON)) { + VP8EncDspCostInitNEON(); + } +#endif } - - cost_last_cpuinfo_used = VP8GetCPUInfo; } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips32.c b/src/3rdparty/libwebp/src/dsp/cost_mips32.c index 3102da8..0500f88 100644 --- a/src/3rdparty/libwebp/src/dsp/cost_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/cost_mips32.c @@ -9,13 +9,13 @@ // // Author: Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS32) -#include "../enc/cost_enc.h" +#include "src/enc/cost_enc.h" -static int GetResidualCost(int ctx0, const VP8Residual* const res) { +static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) { int temp0, temp1; int v_reg, ctx_reg; int n = res->first; @@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) { return cost; } -static void SetResidualCoeffs(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs, + VP8Residual* const res) { const int16_t* p_coeffs = (int16_t*)coeffs; int temp0, temp1, temp2, n, n1; assert(res->first == 0 || coeffs[0] == 0); @@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs, extern void VP8EncDspCostInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) { - VP8GetResidualCost = GetResidualCost; - VP8SetResidualCoeffs = SetResidualCoeffs; + VP8GetResidualCost = GetResidualCost_MIPS32; + VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c index 6ec8aeb..51248de 100644 --- a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c @@ -9,13 +9,13 @@ // // Author: Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "../enc/cost_enc.h" +#include "src/enc/cost_enc.h" -static int GetResidualCost(int ctx0, const VP8Residual* const res) { +static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) { int temp0, temp1; int v_reg, ctx_reg; int n = res->first; @@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) { extern void VP8EncDspCostInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) { - VP8GetResidualCost = GetResidualCost; + VP8GetResidualCost = GetResidualCost_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/cost_neon.c b/src/3rdparty/libwebp/src/dsp/cost_neon.c new file mode 100644 index 0000000..8cc8ce5 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/cost_neon.c @@ -0,0 +1,122 @@ +// Copyright 2018 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// ARM NEON version of cost functions + +#include "src/dsp/dsp.h" + +#if defined(WEBP_USE_NEON) + +#include "src/dsp/neon.h" +#include "src/enc/cost_enc.h" + +static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16 }; + +static void SetResidualCoeffs_NEON(const int16_t* const coeffs, + VP8Residual* const res) { + const int16x8_t minus_one = vdupq_n_s16(-1); + const int16x8_t coeffs_0 = vld1q_s16(coeffs); + const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8); + const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one); + const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one); + const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1)); + const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position)); + +#ifdef __aarch64__ + res->last = vmaxvq_u8(masked) - 1; +#else + const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked)); + const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8); + const uint16x4_t eob_16x4 = + vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8)); + const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4); + uint32x2_t eob_32x2 = + vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4)); + eob_32x2 = vpmax_u32(eob_32x2, eob_32x2); + + vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0); + --res->last; +#endif // __aarch64__ + + res->coeffs = coeffs; +} + +static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) { + uint8_t levels[16], ctxs[16]; + uint16_t abs_levels[16]; + int n = res->first; + // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 + const int p0 = res->prob[n][ctx0][0]; + CostArrayPtr const costs = res->costs; + const uint16_t* t = costs[n][ctx0]; + // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 + // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll + // be missing during the loop. + int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; + + if (res->last < 0) { + return VP8BitCost(0, p0); + } + + { // precompute clamped levels and contexts, packed to 8b. + const uint8x16_t kCst2 = vdupq_n_u8(2); + const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL); + const int16x8_t c0 = vld1q_s16(res->coeffs); + const int16x8_t c1 = vld1q_s16(res->coeffs + 8); + const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0)); + const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1)); + const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1)); + const uint8x16_t G = vminq_u8(F, kCst2); // context = 0,1,2 + const uint8x16_t H = vminq_u8(F, kCst67); // clamp_level in [0..67] + + vst1q_u8(ctxs, G); + vst1q_u8(levels, H); + + vst1q_u16(abs_levels, E0); + vst1q_u16(abs_levels + 8, E1); + } + for (; n < res->last; ++n) { + const int ctx = ctxs[n]; + const int level = levels[n]; + const int flevel = abs_levels[n]; // full level + cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() + t = costs[n + 1][ctx]; + } + // Last coefficient is always non-zero + { + const int level = levels[n]; + const int flevel = abs_levels[n]; + assert(flevel != 0); + cost += VP8LevelFixedCosts[flevel] + t[level]; + if (n < 15) { + const int b = VP8EncBands[n + 1]; + const int ctx = ctxs[n]; + const int last_p0 = res->prob[b][ctx][0]; + cost += VP8BitCost(0, last_p0); + } + } + return cost; +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8EncDspCostInitNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) { + VP8SetResidualCoeffs = SetResidualCoeffs_NEON; + VP8GetResidualCost = GetResidualCost_NEON; +} + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON) + +#endif // WEBP_USE_NEON diff --git a/src/3rdparty/libwebp/src/dsp/cost_sse2.c b/src/3rdparty/libwebp/src/dsp/cost_sse2.c index 421d51f..487a079 100644 --- a/src/3rdparty/libwebp/src/dsp/cost_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/cost_sse2.c @@ -11,19 +11,19 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) #include <emmintrin.h> -#include "../enc/cost_enc.h" -#include "../enc/vp8i_enc.h" -#include "../utils/utils.h" +#include "src/enc/cost_enc.h" +#include "src/enc/vp8i_enc.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ -static void SetResidualCoeffsSSE2(const int16_t* const coeffs, - VP8Residual* const res) { +static void SetResidualCoeffs_SSE2(const int16_t* const coeffs, + VP8Residual* const res) { const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0)); const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8)); // Use SSE2 to compare 16 values with a single instruction. @@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs, res->coeffs = coeffs; } -static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { +static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) { uint8_t levels[16], ctxs[16]; uint16_t abs_levels[16]; int n = res->first; @@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { extern void VP8EncDspCostInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) { - VP8SetResidualCoeffs = SetResidualCoeffsSSE2; - VP8GetResidualCost = GetResidualCostSSE2; + VP8SetResidualCoeffs = SetResidualCoeffs_SSE2; + VP8GetResidualCost = GetResidualCost_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c index b39184e..0fa5b6a 100644 --- a/src/3rdparty/libwebp/src/dsp/cpu.c +++ b/src/3rdparty/libwebp/src/dsp/cpu.c @@ -11,14 +11,14 @@ // // Author: Christian Duvivier (cduvivier@google.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_HAVE_NEON_RTCD) #include <stdio.h> #include <string.h> #endif -#if defined(WEBP_ANDROID_NEON) && !defined(Q_OS_ANDROID_EMBEDDED) +#if defined(WEBP_ANDROID_NEON) #include <cpu-features.h> #endif @@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) { return !!(cpu_info[2] & (1 << 0)); } if (feature == kSlowSSSE3) { - if (is_intel && (cpu_info[2] & (1 << 0))) { // SSSE3? + if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3? return CheckSlowModel(cpu_info[0]); } return 0; @@ -168,13 +168,13 @@ static int x86CPUInfo(CPUFeature feature) { return 0; } VP8CPUInfo VP8GetCPUInfo = x86CPUInfo; -#elif defined(WEBP_ANDROID_NEON) && !defined(Q_OS_ANDROID_EMBEDDED) // NB: needs to be before generic NEON test. +#elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test. static int AndroidCPUInfo(CPUFeature feature) { const AndroidCpuFamily cpu_family = android_getCpuFamily(); const uint64_t cpu_features = android_getCpuFeatures(); if (feature == kNEON) { - return (cpu_family == ANDROID_CPU_FAMILY_ARM && - 0 != (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)); + return cpu_family == ANDROID_CPU_FAMILY_ARM && + (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0; } return 0; } diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c index 007e985..1119842 100644 --- a/src/3rdparty/libwebp/src/dsp/dec.c +++ b/src/3rdparty/libwebp/src/dsp/dec.c @@ -11,9 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" -#include "../dec/vp8i_dec.h" -#include "../utils/utils.h" +#include <assert.h> + +#include "src/dsp/dsp.h" +#include "src/dec/vp8i_dec.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ @@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) { // Transforms (Paragraph 14.4) #define STORE(x, y, v) \ - dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3)) + dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3)) #define STORE2(y, dc, d, c) do { \ const int DC = (dc); \ @@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) { #define MUL1(a) ((((a) * 20091) >> 16) + (a)) #define MUL2(a) (((a) * 35468) >> 16) -static void TransformOne(const int16_t* in, uint8_t* dst) { +#if !WEBP_NEON_OMIT_C_CODE +static void TransformOne_C(const int16_t* in, uint8_t* dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { } // Simplified transform when only in[0], in[1] and in[4] are non-zero -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3_C(const int16_t* in, uint8_t* dst) { const int a = in[0] + 4; const int c4 = MUL2(in[4]); const int d4 = MUL1(in[4]); @@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { #undef MUL2 #undef STORE2 -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { - TransformOne(in, dst); +static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne_C(in, dst); if (do_two) { - TransformOne(in + 16, dst + 4); + TransformOne_C(in + 16, dst + 4); } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void TransformUV(const int16_t* in, uint8_t* dst) { +static void TransformUV_C(const int16_t* in, uint8_t* dst) { VP8Transform(in + 0 * 16, dst, 1); VP8Transform(in + 2 * 16, dst + 4 * BPS, 1); } -static void TransformDC(const int16_t* in, uint8_t* dst) { +#if !WEBP_NEON_OMIT_C_CODE +static void TransformDC_C(const int16_t* in, uint8_t* dst) { const int DC = in[0] + 4; int i, j; for (j = 0; j < 4; ++j) { @@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { } } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void TransformDCUV(const int16_t* in, uint8_t* dst) { +static void TransformDCUV_C(const int16_t* in, uint8_t* dst) { if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst); if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4); if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS); @@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) { //------------------------------------------------------------------------------ // Paragraph 14.3 -static void TransformWHT(const int16_t* in, int16_t* out) { +#if !WEBP_NEON_OMIT_C_CODE +static void TransformWHT_C(const int16_t* in, int16_t* out) { int tmp[16]; int i; for (i = 0; i < 4; ++i) { @@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) { out += 64; } } +#endif // !WEBP_NEON_OMIT_C_CODE void (*VP8TransformWHT)(const int16_t* in, int16_t* out); @@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out); #define DST(x, y) dst[(x) + (y) * BPS] +#if !WEBP_NEON_OMIT_C_CODE static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { const uint8_t* top = dst - BPS; const uint8_t* const clip0 = VP8kclip1 - top[-1]; @@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { dst += BPS; } } -static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } -static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } -static void TM16(uint8_t* dst) { TrueMotion(dst, 16); } +static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); } +static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); } +static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); } //------------------------------------------------------------------------------ // 16x16 -static void VE16(uint8_t* dst) { // vertical +static void VE16_C(uint8_t* dst) { // vertical int j; for (j = 0; j < 16; ++j) { memcpy(dst + j * BPS, dst - BPS, 16); } } -static void HE16(uint8_t* dst) { // horizontal +static void HE16_C(uint8_t* dst) { // horizontal int j; for (j = 16; j > 0; --j) { memset(dst, dst[-1], 16); @@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) { } } -static void DC16(uint8_t* dst) { // DC +static void DC16_C(uint8_t* dst) { // DC int DC = 16; int j; for (j = 0; j < 16; ++j) { @@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) { // DC Put16(DC >> 5, dst); } -static void DC16NoTop(uint8_t* dst) { // DC with top samples not available +static void DC16NoTop_C(uint8_t* dst) { // DC with top samples not available int DC = 8; int j; for (j = 0; j < 16; ++j) { @@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available Put16(DC >> 4, dst); } -static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available +static void DC16NoLeft_C(uint8_t* dst) { // DC with left samples not available int DC = 8; int i; for (i = 0; i < 16; ++i) { @@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available Put16(DC >> 4, dst); } -static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples +static void DC16NoTopLeft_C(uint8_t* dst) { // DC with no top and left samples Put16(0x80, dst); } +#endif // !WEBP_NEON_OMIT_C_CODE VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES]; @@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES]; #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2)) #define AVG2(a, b) (((a) + (b) + 1) >> 1) -static void VE4(uint8_t* dst) { // vertical +#if !WEBP_NEON_OMIT_C_CODE +static void VE4_C(uint8_t* dst) { // vertical const uint8_t* top = dst - BPS; const uint8_t vals[4] = { AVG3(top[-1], top[0], top[1]), @@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) { // vertical memcpy(dst + i * BPS, vals, sizeof(vals)); } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void HE4(uint8_t* dst) { // horizontal +static void HE4_C(uint8_t* dst) { // horizontal const int A = dst[-1 - BPS]; const int B = dst[-1]; const int C = dst[-1 + BPS]; @@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) { // horizontal WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E)); } -static void DC4(uint8_t* dst) { // DC +#if !WEBP_NEON_OMIT_C_CODE +static void DC4_C(uint8_t* dst) { // DC uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS]; @@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) { // DC for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4); } -static void RD4(uint8_t* dst) { // Down-right +static void RD4_C(uint8_t* dst) { // Down-right const int I = dst[-1 + 0 * BPS]; const int J = dst[-1 + 1 * BPS]; const int K = dst[-1 + 2 * BPS]; @@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) { // Down-right DST(3, 0) = AVG3(D, C, B); } -static void LD4(uint8_t* dst) { // Down-Left +static void LD4_C(uint8_t* dst) { // Down-Left const int A = dst[0 - BPS]; const int B = dst[1 - BPS]; const int C = dst[2 - BPS]; @@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) { // Down-Left DST(3, 2) = DST(2, 3) = AVG3(F, G, H); DST(3, 3) = AVG3(G, H, H); } +#endif // !WEBP_NEON_OMIT_C_CODE -static void VR4(uint8_t* dst) { // Vertical-Right +static void VR4_C(uint8_t* dst) { // Vertical-Right const int I = dst[-1 + 0 * BPS]; const int J = dst[-1 + 1 * BPS]; const int K = dst[-1 + 2 * BPS]; @@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right DST(3, 1) = AVG3(B, C, D); } -static void VL4(uint8_t* dst) { // Vertical-Left +static void VL4_C(uint8_t* dst) { // Vertical-Left const int A = dst[0 - BPS]; const int B = dst[1 - BPS]; const int C = dst[2 - BPS]; @@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left DST(3, 3) = AVG3(F, G, H); } -static void HU4(uint8_t* dst) { // Horizontal-Up +static void HU4_C(uint8_t* dst) { // Horizontal-Up const int I = dst[-1 + 0 * BPS]; const int J = dst[-1 + 1 * BPS]; const int K = dst[-1 + 2 * BPS]; @@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) { // Horizontal-Up DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static void HD4(uint8_t* dst) { // Horizontal-Down +static void HD4_C(uint8_t* dst) { // Horizontal-Down const int I = dst[-1 + 0 * BPS]; const int J = dst[-1 + 1 * BPS]; const int K = dst[-1 + 2 * BPS]; @@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES]; //------------------------------------------------------------------------------ // Chroma -static void VE8uv(uint8_t* dst) { // vertical +#if !WEBP_NEON_OMIT_C_CODE +static void VE8uv_C(uint8_t* dst) { // vertical int j; for (j = 0; j < 8; ++j) { memcpy(dst + j * BPS, dst - BPS, 8); } } -static void HE8uv(uint8_t* dst) { // horizontal +static void HE8uv_C(uint8_t* dst) { // horizontal int j; for (j = 0; j < 8; ++j) { memset(dst, dst[-1], 8); @@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) { } } -static void DC8uv(uint8_t* dst) { // DC +static void DC8uv_C(uint8_t* dst) { // DC int dc0 = 8; int i; for (i = 0; i < 8; ++i) { @@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) { // DC Put8x8uv(dc0 >> 4, dst); } -static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples +static void DC8uvNoLeft_C(uint8_t* dst) { // DC with no left samples int dc0 = 4; int i; for (i = 0; i < 8; ++i) { @@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples Put8x8uv(dc0 >> 3, dst); } -static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples +static void DC8uvNoTop_C(uint8_t* dst) { // DC with no top samples int dc0 = 4; int i; for (i = 0; i < 8; ++i) { @@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples Put8x8uv(dc0 >> 3, dst); } -static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing +static void DC8uvNoTopLeft_C(uint8_t* dst) { // DC with nothing Put8x8uv(0x80, dst); } +#endif // !WEBP_NEON_OMIT_C_CODE VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES]; //------------------------------------------------------------------------------ // Edge filtering functions +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC // 4 pixels in, 2 pixels out -static WEBP_INLINE void do_filter2(uint8_t* p, int step) { +static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) { const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step]; const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892] const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15] @@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) { } // 4 pixels in, 4 pixels out -static WEBP_INLINE void do_filter4(uint8_t* p, int step) { +static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) { const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step]; const int a = 3 * (q0 - p0); const int a1 = VP8ksclip2[(a + 4) >> 3]; @@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) { } // 6 pixels in, 6 pixels out -static WEBP_INLINE void do_filter6(uint8_t* p, int step) { +static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) { const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step]; const int q0 = p[0], q1 = p[step], q2 = p[2*step]; const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]]; @@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) { p[ 2*step] = VP8kclip1[q2 - a3]; } -static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) { +static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) { const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step]; return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh); } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC -static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) { +#if !WEBP_NEON_OMIT_C_CODE +static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) { const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step]; return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t); } +#endif // !WEBP_NEON_OMIT_C_CODE -static WEBP_INLINE int needs_filter2(const uint8_t* p, - int step, int t, int it) { +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p, + int step, int t, int it) { const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step]; const int p0 = p[-step], q0 = p[0]; const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step]; @@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p, VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it && VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it; } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC //------------------------------------------------------------------------------ // Simple In-loop filtering (Paragraph 15.2) -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +#if !WEBP_NEON_OMIT_C_CODE +static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) { int i; const int thresh2 = 2 * thresh + 1; for (i = 0; i < 16; ++i) { - if (needs_filter(p + i, stride, thresh2)) { - do_filter2(p + i, stride); + if (NeedsFilter_C(p + i, stride, thresh2)) { + DoFilter2_C(p + i, stride); } } } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) { int i; const int thresh2 = 2 * thresh + 1; for (i = 0; i < 16; ++i) { - if (needs_filter(p + i * stride, 1, thresh2)) { - do_filter2(p + i * stride, 1); + if (NeedsFilter_C(p + i * stride, 1, thresh2)) { + DoFilter2_C(p + i * stride, 1); } } } -static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) { int k; for (k = 3; k > 0; --k) { p += 4 * stride; - SimpleVFilter16(p, stride, thresh); + SimpleVFilter16_C(p, stride, thresh); } } -static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) { int k; for (k = 3; k > 0; --k) { p += 4; - SimpleHFilter16(p, stride, thresh); + SimpleHFilter16_C(p, stride, thresh); } } +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ // Complex In-loop filtering (Paragraph 15.3) -static WEBP_INLINE void FilterLoop26(uint8_t* p, - int hstride, int vstride, int size, - int thresh, int ithresh, int hev_thresh) { +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +static WEBP_INLINE void FilterLoop26_C(uint8_t* p, + int hstride, int vstride, int size, + int thresh, int ithresh, + int hev_thresh) { const int thresh2 = 2 * thresh + 1; while (size-- > 0) { - if (needs_filter2(p, hstride, thresh2, ithresh)) { - if (hev(p, hstride, hev_thresh)) { - do_filter2(p, hstride); + if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) { + if (Hev(p, hstride, hev_thresh)) { + DoFilter2_C(p, hstride); } else { - do_filter6(p, hstride); + DoFilter6_C(p, hstride); } } p += vstride; } } -static WEBP_INLINE void FilterLoop24(uint8_t* p, - int hstride, int vstride, int size, - int thresh, int ithresh, int hev_thresh) { +static WEBP_INLINE void FilterLoop24_C(uint8_t* p, + int hstride, int vstride, int size, + int thresh, int ithresh, + int hev_thresh) { const int thresh2 = 2 * thresh + 1; while (size-- > 0) { - if (needs_filter2(p, hstride, thresh2, ithresh)) { - if (hev(p, hstride, hev_thresh)) { - do_filter2(p, hstride); + if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) { + if (Hev(p, hstride, hev_thresh)) { + DoFilter2_C(p, hstride); } else { - do_filter4(p, hstride); + DoFilter4_C(p, hstride); } } p += vstride; } } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +#if !WEBP_NEON_OMIT_C_CODE // on macroblock edges -static void VFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { - FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh); +static void VFilter16_C(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { + FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh); } -static void HFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { - FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh); +static void HFilter16_C(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { + FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh); } // on three inner edges -static void VFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16i_C(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { int k; for (k = 3; k > 0; --k) { p += 4 * stride; - FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh); + FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh); } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void HFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +static void HFilter16i_C(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { int k; for (k = 3; k > 0; --k) { p += 4; - FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh); + FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh); } } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +#if !WEBP_NEON_OMIT_C_CODE // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { - FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); - FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); +static void VFilter8_C(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { + FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh); + FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh); } +#endif // !WEBP_NEON_OMIT_C_CODE -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { - FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); - FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +static void HFilter8_C(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { + FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh); + FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh); } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { - FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); - FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); +#if !WEBP_NEON_OMIT_C_CODE +static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { + FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); + FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); } +#endif // !WEBP_NEON_OMIT_C_CODE -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { - FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); - FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { + FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); + FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC //------------------------------------------------------------------------------ -static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst, - int dst_stride) { +static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst, + int dst_stride) { int i, j; for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) { @@ -701,62 +741,69 @@ extern void VP8DspInitMIPS32(void); extern void VP8DspInitMIPSdspR2(void); extern void VP8DspInitMSA(void); -static volatile VP8CPUInfo dec_last_cpuinfo_used = - (VP8CPUInfo)&dec_last_cpuinfo_used; +WEBP_DSP_INIT_FUNC(VP8DspInit) { + VP8InitClipTables(); -WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { - if (dec_last_cpuinfo_used == VP8GetCPUInfo) return; +#if !WEBP_NEON_OMIT_C_CODE + VP8TransformWHT = TransformWHT_C; + VP8Transform = TransformTwo_C; + VP8TransformDC = TransformDC_C; + VP8TransformAC3 = TransformAC3_C; +#endif + VP8TransformUV = TransformUV_C; + VP8TransformDCUV = TransformDCUV_C; + +#if !WEBP_NEON_OMIT_C_CODE + VP8VFilter16 = VFilter16_C; + VP8VFilter16i = VFilter16i_C; + VP8HFilter16 = HFilter16_C; + VP8VFilter8 = VFilter8_C; + VP8VFilter8i = VFilter8i_C; + VP8SimpleVFilter16 = SimpleVFilter16_C; + VP8SimpleHFilter16 = SimpleHFilter16_C; + VP8SimpleVFilter16i = SimpleVFilter16i_C; + VP8SimpleHFilter16i = SimpleHFilter16i_C; +#endif - VP8InitClipTables(); +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC + VP8HFilter16i = HFilter16i_C; + VP8HFilter8 = HFilter8_C; + VP8HFilter8i = HFilter8i_C; +#endif + +#if !WEBP_NEON_OMIT_C_CODE + VP8PredLuma4[0] = DC4_C; + VP8PredLuma4[1] = TM4_C; + VP8PredLuma4[2] = VE4_C; + VP8PredLuma4[4] = RD4_C; + VP8PredLuma4[6] = LD4_C; +#endif + + VP8PredLuma4[3] = HE4_C; + VP8PredLuma4[5] = VR4_C; + VP8PredLuma4[7] = VL4_C; + VP8PredLuma4[8] = HD4_C; + VP8PredLuma4[9] = HU4_C; + +#if !WEBP_NEON_OMIT_C_CODE + VP8PredLuma16[0] = DC16_C; + VP8PredLuma16[1] = TM16_C; + VP8PredLuma16[2] = VE16_C; + VP8PredLuma16[3] = HE16_C; + VP8PredLuma16[4] = DC16NoTop_C; + VP8PredLuma16[5] = DC16NoLeft_C; + VP8PredLuma16[6] = DC16NoTopLeft_C; + + VP8PredChroma8[0] = DC8uv_C; + VP8PredChroma8[1] = TM8uv_C; + VP8PredChroma8[2] = VE8uv_C; + VP8PredChroma8[3] = HE8uv_C; + VP8PredChroma8[4] = DC8uvNoTop_C; + VP8PredChroma8[5] = DC8uvNoLeft_C; + VP8PredChroma8[6] = DC8uvNoTopLeft_C; +#endif - VP8TransformWHT = TransformWHT; - VP8Transform = TransformTwo; - VP8TransformUV = TransformUV; - VP8TransformDC = TransformDC; - VP8TransformDCUV = TransformDCUV; - VP8TransformAC3 = TransformAC3; - - VP8VFilter16 = VFilter16; - VP8HFilter16 = HFilter16; - VP8VFilter8 = VFilter8; - VP8HFilter8 = HFilter8; - VP8VFilter16i = VFilter16i; - VP8HFilter16i = HFilter16i; - VP8VFilter8i = VFilter8i; - VP8HFilter8i = HFilter8i; - VP8SimpleVFilter16 = SimpleVFilter16; - VP8SimpleHFilter16 = SimpleHFilter16; - VP8SimpleVFilter16i = SimpleVFilter16i; - VP8SimpleHFilter16i = SimpleHFilter16i; - - VP8PredLuma4[0] = DC4; - VP8PredLuma4[1] = TM4; - VP8PredLuma4[2] = VE4; - VP8PredLuma4[3] = HE4; - VP8PredLuma4[4] = RD4; - VP8PredLuma4[5] = VR4; - VP8PredLuma4[6] = LD4; - VP8PredLuma4[7] = VL4; - VP8PredLuma4[8] = HD4; - VP8PredLuma4[9] = HU4; - - VP8PredLuma16[0] = DC16; - VP8PredLuma16[1] = TM16; - VP8PredLuma16[2] = VE16; - VP8PredLuma16[3] = HE16; - VP8PredLuma16[4] = DC16NoTop; - VP8PredLuma16[5] = DC16NoLeft; - VP8PredLuma16[6] = DC16NoTopLeft; - - VP8PredChroma8[0] = DC8uv; - VP8PredChroma8[1] = TM8uv; - VP8PredChroma8[2] = VE8uv; - VP8PredChroma8[3] = HE8uv; - VP8PredChroma8[4] = DC8uvNoTop; - VP8PredChroma8[5] = DC8uvNoLeft; - VP8PredChroma8[6] = DC8uvNoTopLeft; - - VP8DitherCombine8x8 = DitherCombine8x8; + VP8DitherCombine8x8 = DitherCombine8x8_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { @@ -770,11 +817,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { #endif } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8DspInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { VP8DspInitMIPS32(); @@ -791,5 +833,55 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { } #endif } - dec_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8DspInitNEON(); + } +#endif + + assert(VP8TransformWHT != NULL); + assert(VP8Transform != NULL); + assert(VP8TransformDC != NULL); + assert(VP8TransformAC3 != NULL); + assert(VP8TransformUV != NULL); + assert(VP8TransformDCUV != NULL); + assert(VP8VFilter16 != NULL); + assert(VP8HFilter16 != NULL); + assert(VP8VFilter8 != NULL); + assert(VP8HFilter8 != NULL); + assert(VP8VFilter16i != NULL); + assert(VP8HFilter16i != NULL); + assert(VP8VFilter8i != NULL); + assert(VP8HFilter8i != NULL); + assert(VP8SimpleVFilter16 != NULL); + assert(VP8SimpleHFilter16 != NULL); + assert(VP8SimpleVFilter16i != NULL); + assert(VP8SimpleHFilter16i != NULL); + assert(VP8PredLuma4[0] != NULL); + assert(VP8PredLuma4[1] != NULL); + assert(VP8PredLuma4[2] != NULL); + assert(VP8PredLuma4[3] != NULL); + assert(VP8PredLuma4[4] != NULL); + assert(VP8PredLuma4[5] != NULL); + assert(VP8PredLuma4[6] != NULL); + assert(VP8PredLuma4[7] != NULL); + assert(VP8PredLuma4[8] != NULL); + assert(VP8PredLuma4[9] != NULL); + assert(VP8PredLuma16[0] != NULL); + assert(VP8PredLuma16[1] != NULL); + assert(VP8PredLuma16[2] != NULL); + assert(VP8PredLuma16[3] != NULL); + assert(VP8PredLuma16[4] != NULL); + assert(VP8PredLuma16[5] != NULL); + assert(VP8PredLuma16[6] != NULL); + assert(VP8PredChroma8[0] != NULL); + assert(VP8PredChroma8[1] != NULL); + assert(VP8PredChroma8[2] != NULL); + assert(VP8PredChroma8[3] != NULL); + assert(VP8PredChroma8[4] != NULL); + assert(VP8PredChroma8[5] != NULL); + assert(VP8PredChroma8[6] != NULL); + assert(VP8DitherCombine8x8 != NULL); } diff --git a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c index 74ba34c..427b74f 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c +++ b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c @@ -11,11 +11,14 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" -#define USE_STATIC_TABLES // undefine to have run-time table initialization +// define to 0 to have run-time table initialization +#if !defined(USE_STATIC_TABLES) +#define USE_STATIC_TABLES 1 // ALTERNATE_CODE +#endif -#ifdef USE_STATIC_TABLES +#if (USE_STATIC_TABLES == 1) static const uint8_t abs0[255 + 255 + 1] = { 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, @@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1]; // and make sure it's set to true _last_ (so as to be thread-safe) static volatile int tables_ok = 0; -#endif +#endif // USE_STATIC_TABLES const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020]; const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112]; @@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255]; const uint8_t* const VP8kabs0 = &abs0[255]; WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) { -#if !defined(USE_STATIC_TABLES) +#if (USE_STATIC_TABLES == 0) int i; if (!tables_ok) { for (i = -255; i <= 255; ++i) { diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips32.c b/src/3rdparty/libwebp/src/dsp/dec_mips32.c index 4e9ef42..e4e7096 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/dec_mips32.c @@ -12,11 +12,11 @@ // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) // Jovan Zelincevic (jovan.zelincevic@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS32) -#include "./mips_macro.h" +#include "src/dsp/mips_macro.h" static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c index db5c657..b0936bc 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c @@ -12,11 +12,11 @@ // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) // Jovan Zelincevic (jovan.zelincevic@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "./mips_macro.h" +#include "src/dsp/mips_macro.h" static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; diff --git a/src/3rdparty/libwebp/src/dsp/dec_msa.c b/src/3rdparty/libwebp/src/dsp/dec_msa.c index 8d9c98c..8090622 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_msa.c +++ b/src/3rdparty/libwebp/src/dsp/dec_msa.c @@ -12,11 +12,11 @@ // Author(s): Prashant Patil (prashant.patil@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MSA) -#include "./msa_macro.h" +#include "src/dsp/msa_macro.h" //------------------------------------------------------------------------------ // Transforms @@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const v16i8 cnst4b = __msa_ldi_b(4); \ const v16i8 cnst3b = __msa_ldi_b(3); \ const v8i16 cnst9h = __msa_ldi_h(9); \ + const v8i16 cnst63h = __msa_ldi_h(63); \ \ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \ filt = __msa_subs_s_b(p1_m, q1_m); \ @@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ /* update q2/p2 */ \ temp0 = filt_r * cnst9h; \ - temp1 = ADDVI_H(temp0, 63); \ + temp1 = temp0 + cnst63h; \ temp2 = filt_l * cnst9h; \ - temp3 = ADDVI_H(temp2, 63); \ + temp3 = temp2 + cnst63h; \ FILT2(q2_m, p2_m, q2, p2); \ /* update q1/p1 */ \ temp1 = temp1 + temp0; \ @@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) { // vertical const uint32_t val0 = LW(ptop + 0); const uint32_t val1 = LW(ptop + 4); uint32_t out; - v16u8 A, B, C, AC, B2, R; + v16u8 A = { 0 }, B, C, AC, B2, R; INSERT_W2_UB(val0, val1, A); B = SLDI_UB(A, A, 1); @@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) { // Down-right uint32_t val0 = LW(ptop + 0); uint32_t val1 = LW(ptop + 4); uint32_t val2, val3; - v16u8 A, B, C, AC, B2, R, A1; + v16u8 A, B, C, AC, B2, R, A1 = { 0 }; INSERT_W2_UB(val0, val1, A1); A = SLDI_UB(A1, A1, 12); @@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) { // Down-Left uint32_t val0 = LW(ptop + 0); uint32_t val1 = LW(ptop + 4); uint32_t val2, val3; - v16u8 A, B, C, AC, B2, R; + v16u8 A = { 0 }, B, C, AC, B2, R; INSERT_W2_UB(val0, val1, A); B = SLDI_UB(A, A, 1); diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c index 34796cf..ffa697f 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_neon.c +++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c @@ -12,43 +12,23 @@ // Authors: Somnath Banerjee (somnath@google.com) // Johann Koenig (johannkoenig@google.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) -#include "./neon.h" -#include "../dec/vp8i_dec.h" +#include "src/dsp/neon.h" +#include "src/dec/vp8i_dec.h" //------------------------------------------------------------------------------ // NxM Loading functions -// Load/Store vertical edge -#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ - "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ - "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ - "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ - "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ - "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \ - "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \ - "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \ - "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n" - -#define STORE8x2(c1, c2, p, stride) \ - "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \ - "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n" - #if !defined(WORK_AROUND_GCC) // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation // (register alloc, probably). The variants somewhat mitigate the problem, but // not quite. HFilter16i() remains problematic. -static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) { +static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src, + int stride) { const uint8x8_t zero = vdup_n_u8(0); uint8x8x4_t out; INIT_VECTOR4(out, zero, zero, zero, zero); @@ -63,13 +43,15 @@ static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) { return out; } -static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1) { +static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride, + uint8x16_t* const p1, + uint8x16_t* const p0, + uint8x16_t* const q0, + uint8x16_t* const q1) { // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7] // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15] - const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride); - const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride); + const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride); + const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride); *p1 = vcombine_u8(row0.val[0], row8.val[0]); *p0 = vcombine_u8(row0.val[1], row8.val[1]); *q0 = vcombine_u8(row0.val[2], row8.val[2]); @@ -83,9 +65,11 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride, src += stride; \ } while (0) -static WEBP_INLINE void Load4x16(const uint8_t* src, int stride, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1) { +static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride, + uint8x16_t* const p1, + uint8x16_t* const p0, + uint8x16_t* const q0, + uint8x16_t* const q1) { const uint32x4_t zero = vdupq_n_u32(0); uint32x4x4_t in; INIT_VECTOR4(in, zero, zero, zero, zero); @@ -126,40 +110,40 @@ static WEBP_INLINE void Load4x16(const uint8_t* src, int stride, #endif // !WORK_AROUND_GCC -static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride, - uint8x16_t* const p3, uint8x16_t* const p2, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1, - uint8x16_t* const q2, uint8x16_t* const q3) { - Load4x16(src - 2, stride, p3, p2, p1, p0); - Load4x16(src + 2, stride, q0, q1, q2, q3); +static WEBP_INLINE void Load8x16_NEON( + const uint8_t* const src, int stride, + uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1, + uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1, + uint8x16_t* const q2, uint8x16_t* const q3) { + Load4x16_NEON(src - 2, stride, p3, p2, p1, p0); + Load4x16_NEON(src + 2, stride, q0, q1, q2, q3); } -static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1) { +static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride, + uint8x16_t* const p1, + uint8x16_t* const p0, + uint8x16_t* const q0, + uint8x16_t* const q1) { *p1 = vld1q_u8(src - 2 * stride); *p0 = vld1q_u8(src - 1 * stride); *q0 = vld1q_u8(src + 0 * stride); *q1 = vld1q_u8(src + 1 * stride); } -static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride, - uint8x16_t* const p3, uint8x16_t* const p2, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1, - uint8x16_t* const q2, uint8x16_t* const q3) { - Load16x4(src - 2 * stride, stride, p3, p2, p1, p0); - Load16x4(src + 2 * stride, stride, q0, q1, q2, q3); +static WEBP_INLINE void Load16x8_NEON( + const uint8_t* const src, int stride, + uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1, + uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1, + uint8x16_t* const q2, uint8x16_t* const q3) { + Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0); + Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3); } -static WEBP_INLINE void Load8x8x2(const uint8_t* const u, - const uint8_t* const v, - int stride, - uint8x16_t* const p3, uint8x16_t* const p2, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1, - uint8x16_t* const q2, uint8x16_t* const q3) { +static WEBP_INLINE void Load8x8x2_NEON( + const uint8_t* const u, const uint8_t* const v, int stride, + uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1, + uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1, + uint8x16_t* const q2, uint8x16_t* const q3) { // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination // and the v-samples on the higher half. *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride)); @@ -177,13 +161,11 @@ static WEBP_INLINE void Load8x8x2(const uint8_t* const u, #define LOAD_UV_8(ROW) \ vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride)) -static WEBP_INLINE void Load8x8x2T(const uint8_t* const u, - const uint8_t* const v, - int stride, - uint8x16_t* const p3, uint8x16_t* const p2, - uint8x16_t* const p1, uint8x16_t* const p0, - uint8x16_t* const q0, uint8x16_t* const q1, - uint8x16_t* const q2, uint8x16_t* const q3) { +static WEBP_INLINE void Load8x8x2T_NEON( + const uint8_t* const u, const uint8_t* const v, int stride, + uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1, + uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1, + uint8x16_t* const q2, uint8x16_t* const q3) { // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination // and the v-samples on the higher half. const uint8x16_t row0 = LOAD_UV_8(0); @@ -238,8 +220,8 @@ static WEBP_INLINE void Load8x8x2T(const uint8_t* const u, #endif // !WORK_AROUND_GCC -static WEBP_INLINE void Store2x8(const uint8x8x2_t v, - uint8_t* const dst, int stride) { +static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v, + uint8_t* const dst, int stride) { vst2_lane_u8(dst + 0 * stride, v, 0); vst2_lane_u8(dst + 1 * stride, v, 1); vst2_lane_u8(dst + 2 * stride, v, 2); @@ -250,20 +232,20 @@ static WEBP_INLINE void Store2x8(const uint8x8x2_t v, vst2_lane_u8(dst + 7 * stride, v, 7); } -static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0, - uint8_t* const dst, int stride) { +static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0, + uint8_t* const dst, int stride) { uint8x8x2_t lo, hi; lo.val[0] = vget_low_u8(p0); lo.val[1] = vget_low_u8(q0); hi.val[0] = vget_high_u8(p0); hi.val[1] = vget_high_u8(q0); - Store2x8(lo, dst - 1 + 0 * stride, stride); - Store2x8(hi, dst - 1 + 8 * stride, stride); + Store2x8_NEON(lo, dst - 1 + 0 * stride, stride); + Store2x8_NEON(hi, dst - 1 + 8 * stride, stride); } #if !defined(WORK_AROUND_GCC) -static WEBP_INLINE void Store4x8(const uint8x8x4_t v, - uint8_t* const dst, int stride) { +static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v, + uint8_t* const dst, int stride) { vst4_lane_u8(dst + 0 * stride, v, 0); vst4_lane_u8(dst + 1 * stride, v, 1); vst4_lane_u8(dst + 2 * stride, v, 2); @@ -274,9 +256,9 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v, vst4_lane_u8(dst + 7 * stride, v, 7); } -static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8_t* const dst, int stride) { +static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + uint8_t* const dst, int stride) { uint8x8x4_t lo, hi; INIT_VECTOR4(lo, vget_low_u8(p1), vget_low_u8(p0), @@ -284,27 +266,28 @@ static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0, INIT_VECTOR4(hi, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), vget_high_u8(q1)); - Store4x8(lo, dst - 2 + 0 * stride, stride); - Store4x8(hi, dst - 2 + 8 * stride, stride); + Store4x8_NEON(lo, dst - 2 + 0 * stride, stride); + Store4x8_NEON(hi, dst - 2 + 8 * stride, stride); } #endif // !WORK_AROUND_GCC -static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0, - uint8_t* const dst, int stride) { +static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0, + uint8_t* const dst, int stride) { vst1q_u8(dst - stride, p0); vst1q_u8(dst, q0); } -static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8_t* const dst, int stride) { - Store16x2(p1, p0, dst - stride, stride); - Store16x2(q0, q1, dst + stride, stride); +static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + uint8_t* const dst, int stride) { + Store16x2_NEON(p1, p0, dst - stride, stride); + Store16x2_NEON(q0, q1, dst + stride, stride); } -static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0, - uint8_t* const u, uint8_t* const v, - int stride) { +static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0, + const uint8x16_t q0, + uint8_t* const u, uint8_t* const v, + int stride) { // p0 and q0 contain the u+v samples packed in low/high halves. vst1_u8(u - stride, vget_low_u8(p0)); vst1_u8(u, vget_low_u8(q0)); @@ -312,13 +295,15 @@ static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0, vst1_u8(v, vget_high_u8(q0)); } -static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8_t* const u, uint8_t* const v, - int stride) { +static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1, + const uint8x16_t p0, + const uint8x16_t q0, + const uint8x16_t q1, + uint8_t* const u, uint8_t* const v, + int stride) { // The p1...q1 registers contain the u+v samples packed in low/high halves. - Store8x2x2(p1, p0, u - stride, v - stride, stride); - Store8x2x2(q0, q1, u + stride, v + stride, stride); + Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride); + Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride); } #if !defined(WORK_AROUND_GCC) @@ -329,11 +314,10 @@ static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, (DST) += stride; \ } while (0) -static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, - const uint8x16_t p0, const uint8x16_t q0, - const uint8x16_t q1, const uint8x16_t q2, - uint8_t* u, uint8_t* v, - int stride) { +static WEBP_INLINE void Store6x8x2_NEON( + const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, + uint8_t* u, uint8_t* v, int stride) { uint8x8x3_t u0, u1, v0, v1; INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)); INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)); @@ -358,10 +342,12 @@ static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, } #undef STORE6_LANE -static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - uint8_t* const u, uint8_t* const v, - int stride) { +static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1, + const uint8x16_t p0, + const uint8x16_t q0, + const uint8x16_t q1, + uint8_t* const u, uint8_t* const v, + int stride) { uint8x8x4_t u0, v0; INIT_VECTOR4(u0, vget_low_u8(p1), vget_low_u8(p0), @@ -390,15 +376,15 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, #endif // !WORK_AROUND_GCC // Zero extend 'v' to an int16x8_t. -static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) { +static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) { return vreinterpretq_s16_u16(vmovl_u8(v)); } // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result // to the corresponding rows of 'dst'. -static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, - const int16x8_t dst01, - const int16x8_t dst23) { +static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst, + const int16x8_t dst01, + const int16x8_t dst23) { // Unsigned saturate to 8b. const uint8x8_t dst01_u8 = vqmovun_s16(dst01); const uint8x8_t dst23_u8 = vqmovun_s16(dst23); @@ -410,8 +396,9 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); } -static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, - uint8_t* const dst) { +static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01, + const int16x8_t row23, + uint8_t* const dst) { uint32x2_t dst01 = vdup_n_u32(0); uint32x2_t dst23 = vdup_n_u32(0); @@ -423,23 +410,23 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, { // Convert to 16b. - const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01)); - const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23)); + const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01)); + const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23)); // Descale with rounding. const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); // Add the inverse transform. - SaturateAndStore4x4(dst, out01, out23); + SaturateAndStore4x4_NEON(dst, out01, out23); } } //----------------------------------------------------------------------------- // Simple In-loop filtering (Paragraph 15.2) -static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - int thresh) { +static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + int thresh) { const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh); const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0) const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1) @@ -450,18 +437,18 @@ static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, return mask; } -static int8x16_t FlipSign(const uint8x16_t v) { +static int8x16_t FlipSign_NEON(const uint8x16_t v) { const uint8x16_t sign_bit = vdupq_n_u8(0x80); return vreinterpretq_s8_u8(veorq_u8(v, sign_bit)); } -static uint8x16_t FlipSignBack(const int8x16_t v) { +static uint8x16_t FlipSignBack_NEON(const int8x16_t v) { const int8x16_t sign_bit = vdupq_n_s8(0x80); return vreinterpretq_u8_s8(veorq_s8(v, sign_bit)); } -static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, - const int8x16_t q0, const int8x16_t q1) { +static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0, + const int8x16_t q0, const int8x16_t q1) { const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1) const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0) @@ -470,7 +457,7 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, return s3; } -static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { +static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) { const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) @@ -479,9 +466,10 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { //------------------------------------------------------------------------------ -static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, - const int8x16_t delta, - int8x16_t* const op0, int8x16_t* const oq0) { +static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s, + const int8x16_t delta, + int8x16_t* const op0, + int8x16_t* const oq0) { const int8x16_t kCst3 = vdupq_n_s8(0x03); const int8x16_t kCst4 = vdupq_n_s8(0x04); const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); @@ -494,9 +482,9 @@ static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s, #if defined(WEBP_USE_INTRINSICS) -static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, - const int8x16_t delta, - uint8x16_t* const op0, uint8x16_t* const oq0) { +static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s, + const int8x16_t delta, + uint8x16_t* const op0, uint8x16_t* const oq0) { const int8x16_t kCst3 = vdupq_n_s8(0x03); const int8x16_t kCst4 = vdupq_n_s8(0x04); const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); @@ -505,45 +493,66 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); const int8x16_t sp0 = vqaddq_s8(p0s, delta3); const int8x16_t sq0 = vqsubq_s8(q0s, delta4); - *op0 = FlipSignBack(sp0); - *oq0 = FlipSignBack(sq0); -} - -static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - const uint8x16_t mask, - uint8x16_t* const op0, uint8x16_t* const oq0) { - const int8x16_t p1s = FlipSign(p1); - const int8x16_t p0s = FlipSign(p0); - const int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); - const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + *op0 = FlipSignBack_NEON(sp0); + *oq0 = FlipSignBack_NEON(sq0); +} + +static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t mask, + uint8x16_t* const op0, uint8x16_t* const oq0) { + const int8x16_t p1s = FlipSign_NEON(p1); + const int8x16_t p0s = FlipSign_NEON(p0); + const int8x16_t q0s = FlipSign_NEON(q0); + const int8x16_t q1s = FlipSign_NEON(q1); + const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); - ApplyFilter2(p0s, q0s, delta1, op0, oq0); + ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0); } -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) { uint8x16_t p1, p0, q0, q1, op0, oq0; - Load16x4(p, stride, &p1, &p0, &q0, &q1); + Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1); { - const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); - DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); + const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh); + DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0); } - Store16x2(op0, oq0, p, stride); + Store16x2_NEON(op0, oq0, p, stride); } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) { uint8x16_t p1, p0, q0, q1, oq0, op0; - Load4x16(p, stride, &p1, &p0, &q0, &q1); + Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1); { - const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); - DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); + const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh); + DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0); } - Store2x16(op0, oq0, p, stride); + Store2x16_NEON(op0, oq0, p, stride); } #else +// Load/Store vertical edge +#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ + "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \ + "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \ + "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \ + "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \ + "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n" + +#define STORE8x2(c1, c2, p, stride) \ + "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \ + "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n" + #define QRegs "q0", "q1", "q2", "q3", \ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" @@ -592,7 +601,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ FLIP_SIGN_BIT2(p0, q0, q10) -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) { __asm__ volatile ( "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride @@ -613,7 +622,7 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { ); } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) { __asm__ volatile ( "sub r4, %[p], #2 \n" // base1 = p - 2 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride @@ -639,30 +648,33 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { ); } +#undef LOAD8x4 +#undef STORE8x2 + #endif // WEBP_USE_INTRINSICS -static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) { uint32_t k; for (k = 3; k != 0; --k) { p += 4 * stride; - SimpleVFilter16(p, stride, thresh); + SimpleVFilter16_NEON(p, stride, thresh); } } -static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) { uint32_t k; for (k = 3; k != 0; --k) { p += 4; - SimpleHFilter16(p, stride, thresh); + SimpleHFilter16_NEON(p, stride, thresh); } } //------------------------------------------------------------------------------ // Complex In-loop filtering (Paragraph 15.3) -static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - int hev_thresh) { +static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + int hev_thresh) { const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) @@ -671,11 +683,11 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, return mask; } -static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, - const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, - const uint8x16_t q2, const uint8x16_t q3, - int ithresh, int thresh) { +static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2, + const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t q2, const uint8x16_t q3, + int ithresh, int thresh) { const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) @@ -689,14 +701,14 @@ static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t max12 = vmaxq_u8(max1, max2); const uint8x16_t max123 = vmaxq_u8(max12, max3); const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123); - const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh); + const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh); const uint8x16_t mask = vandq_u8(mask1, mask2); return mask; } // 4-points filter -static void ApplyFilter4( +static void ApplyFilter4_NEON( const int8x16_t p1, const int8x16_t p0, const int8x16_t q0, const int8x16_t q1, const int8x16_t delta0, @@ -709,47 +721,47 @@ static void ApplyFilter4( const int8x16_t a1 = vshrq_n_s8(delta1, 3); const int8x16_t a2 = vshrq_n_s8(delta2, 3); const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1 - *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2) - *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) - *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3) - *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3) + *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2) + *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1) + *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3) + *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3) } -static void DoFilter4( +static void DoFilter4_NEON( const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t mask, const uint8x16_t hev_mask, uint8x16_t* const op1, uint8x16_t* const op0, uint8x16_t* const oq0, uint8x16_t* const oq1) { // This is a fused version of DoFilter2() calling ApplyFilter2 directly - const int8x16_t p1s = FlipSign(p1); - int8x16_t p0s = FlipSign(p0); - int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); + const int8x16_t p1s = FlipSign_NEON(p1); + int8x16_t p0s = FlipSign_NEON(p0); + int8x16_t q0s = FlipSign_NEON(q0); + const int8x16_t q1s = FlipSign_NEON(q1); const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); // do_filter2 part (simple loopfilter on pixels with hev) { - const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); const int8x16_t simple_lf_delta = vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); - ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); + ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s); } // do_filter4 part (complex loopfilter on pixels without hev) { - const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); + const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s); // we use: (mask & hev_mask) ^ mask = mask & !hev_mask const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); const int8x16_t complex_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); - ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); + ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); } } // 6-points filter -static void ApplyFilter6( +static void ApplyFilter6_NEON( const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, const int8x16_t delta, @@ -778,35 +790,35 @@ static void ApplyFilter6( const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); - *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) - *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - q1) - *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) - *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) - *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) - *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) + *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1) + *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1) + *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2) + *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2) + *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3) + *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3) } -static void DoFilter6( +static void DoFilter6_NEON( const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, const uint8x16_t mask, const uint8x16_t hev_mask, uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { // This is a fused version of DoFilter2() calling ApplyFilter2 directly - const int8x16_t p2s = FlipSign(p2); - const int8x16_t p1s = FlipSign(p1); - int8x16_t p0s = FlipSign(p0); - int8x16_t q0s = FlipSign(q0); - const int8x16_t q1s = FlipSign(q1); - const int8x16_t q2s = FlipSign(q2); + const int8x16_t p2s = FlipSign_NEON(p2); + const int8x16_t p1s = FlipSign_NEON(p1); + int8x16_t p0s = FlipSign_NEON(p0); + int8x16_t q0s = FlipSign_NEON(q0); + const int8x16_t q1s = FlipSign_NEON(q1); + const int8x16_t q2s = FlipSign_NEON(q2); const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); - const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); + const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s); // do_filter2 part (simple loopfilter on pixels with hev) { const int8x16_t simple_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); - ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s); + ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s); } // do_filter6 part (complex loopfilter on pixels without hev) @@ -815,65 +827,65 @@ static void DoFilter6( const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); const int8x16_t complex_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); - ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, - op2, op1, op0, oq0, oq1, oq2); + ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, + op2, op1, op0, oq0, oq1, oq2); } } // on macroblock edges -static void VFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store16x2(op2, op1, p - 2 * stride, stride); - Store16x2(op0, oq0, p + 0 * stride, stride); - Store16x2(oq1, oq2, p + 2 * stride, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store16x2_NEON(op2, op1, p - 2 * stride, stride); + Store16x2_NEON(op0, oq0, p + 0 * stride, stride); + Store16x2_NEON(oq1, oq2, p + 2 * stride, stride); } } -static void HFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store2x16(op2, op1, p - 2, stride); - Store2x16(op0, oq0, p + 0, stride); - Store2x16(oq1, oq2, p + 2, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store2x16_NEON(op2, op1, p - 2, stride); + Store2x16_NEON(op0, oq0, p + 0, stride); + Store2x16_NEON(oq1, oq2, p + 2, stride); } } // on three inner edges -static void VFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16i_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint32_t k; uint8x16_t p3, p2, p1, p0; - Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0); + Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0); for (k = 3; k != 0; --k) { uint8x16_t q0, q1, q2, q3; p += 4 * stride; - Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3); + Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3); { const uint8x16_t mask = - NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); - Store16x4(p1, p0, p3, p2, p, stride); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); + Store16x4_NEON(p1, p0, p3, p2, p, stride); p1 = q2; p0 = q3; } @@ -881,21 +893,21 @@ static void VFilter16i(uint8_t* p, int stride, } #if !defined(WORK_AROUND_GCC) -static void HFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16i_NEON(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { uint32_t k; uint8x16_t p3, p2, p1, p0; - Load4x16(p + 2, stride, &p3, &p2, &p1, &p0); + Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0); for (k = 3; k != 0; --k) { uint8x16_t q0, q1, q2, q3; p += 4; - Load4x16(p + 2, stride, &q0, &q1, &q2, &q3); + Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3); { const uint8x16_t mask = - NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); - Store4x16(p1, p0, p3, p2, p, stride); + NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); + Store4x16_NEON(p1, p0, p3, p2, p, stride); p1 = q2; p0 = q3; } @@ -904,67 +916,67 @@ static void HFilter16i(uint8_t* p, int stride, #endif // !WORK_AROUND_GCC // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride); - Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride); - Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride); + Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride); + Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); } } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4 * stride; v += 4 * stride; - Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op1, op0, oq0, oq1; - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); - Store8x4x2(op1, op0, oq0, oq1, u, v, stride); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); + Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride); } } #if !defined(WORK_AROUND_GCC) -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; - Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op2, op1, op0, oq0, oq1, oq2; - DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, - &op2, &op1, &op0, &oq0, &oq1, &oq2); - Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride); + DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask, + &op2, &op1, &op0, &oq0, &oq1, &oq2); + Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride); } } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; u += 4; v += 4; - Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); { - const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, - ithresh, thresh); - const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); + const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, + ithresh, thresh); + const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh); uint8x16_t op1, op0, oq0, oq1; - DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); - Store4x8x2(op1, op0, oq0, oq1, u, v, stride); + DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); + Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride); } } #endif // !WORK_AROUND_GCC @@ -992,8 +1004,9 @@ static const int16_t kC1 = 20091; static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. #if defined(WEBP_USE_INTRINSICS) -static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, - int16x8x2_t* const out) { +static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0, + const int16x8_t in1, + int16x8x2_t* const out) { // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... @@ -1001,7 +1014,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); } -static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { +static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { // {rows} = in0 | in4 // in8 | in12 // B1 = in4 | in12 @@ -1024,20 +1037,20 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); - Transpose8x2(E0, E1, rows); + Transpose8x2_NEON(E0, E1, rows); } -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); - TransformPass(&rows); - TransformPass(&rows); - Add4x4(rows.val[0], rows.val[1], dst); + TransformPass_NEON(&rows); + TransformPass_NEON(&rows); + Add4x4_NEON(rows.val[0], rows.val[1], dst); } #else -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOne_NEON(const int16_t* in, uint8_t* dst) { const int kBPS = BPS; // kC1, kC2. Padded because vld1.16 loads 8 bytes const int16_t constants[4] = { kC1, kC2, 0, 0 }; @@ -1170,16 +1183,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { #endif // WEBP_USE_INTRINSICS -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { - TransformOne(in, dst); +static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne_NEON(in, dst); if (do_two) { - TransformOne(in + 16, dst + 4); + TransformOne_NEON(in + 16, dst + 4); } } -static void TransformDC(const int16_t* in, uint8_t* dst) { +static void TransformDC_NEON(const int16_t* in, uint8_t* dst) { const int16x8_t DC = vdupq_n_s16(in[0]); - Add4x4(DC, DC, dst); + Add4x4_NEON(DC, DC, dst); } //------------------------------------------------------------------------------ @@ -1191,7 +1204,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ } while (0) -static void TransformWHT(const int16_t* in, int16_t* out) { +static void TransformWHT_NEON(const int16_t* in, int16_t* out) { int32x4x4_t tmp; { @@ -1209,7 +1222,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) { tmp.val[2] = vsubq_s32(a0, a1); tmp.val[3] = vsubq_s32(a3, a2); // Arrange the temporary results column-wise. - tmp = Transpose4x4(tmp); + tmp = Transpose4x4_NEON(tmp); } { @@ -1243,7 +1256,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) { //------------------------------------------------------------------------------ #define MUL(a, b) (((a) * (b)) >> 16) -static void TransformAC3(const int16_t* in, uint8_t* dst) { +static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { static const int kC1_full = 20091 + (1 << 16); static const int kC2_full = 35468; const int16x4_t A = vld1_dup_s16(in); @@ -1259,14 +1272,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const int16x4_t B = vqadd_s16(A, CD); const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); - Add4x4(m0_m1, m2_m3, dst); + Add4x4_NEON(m0_m1, m2_m3, dst); } #undef MUL //------------------------------------------------------------------------------ // 4x4 -static void DC4(uint8_t* dst) { // DC +static void DC4_NEON(uint8_t* dst) { // DC const uint8x8_t A = vld1_u8(dst - BPS); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); @@ -1287,17 +1300,17 @@ static void DC4(uint8_t* dst) { // DC } // TrueMotion (4x4 + 8x8) -static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { +static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) { const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]' const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1] int y; for (y = 0; y < size; y += 4) { // left edge - const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); - const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); - const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); - const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); + const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1)); + const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1)); + const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1)); + const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1)); const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1] const int16x8_t r1 = vaddq_s16(L1, d); const int16x8_t r2 = vaddq_s16(L2, d); @@ -1322,9 +1335,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { } } -static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } +static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); } -static void VE4(uint8_t* dst) { // vertical +static void VE4_NEON(uint8_t* dst) { // vertical // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS. const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row const uint64x1_t A1 = vshr_n_u64(A0, 8); @@ -1340,7 +1353,7 @@ static void VE4(uint8_t* dst) { // vertical } } -static void RD4(uint8_t* dst) { // Down-right +static void RD4_NEON(uint8_t* dst) { // Down-right const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1); const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); @@ -1368,7 +1381,7 @@ static void RD4(uint8_t* dst) { // Down-right vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0); } -static void LD4(uint8_t* dst) { // Down-left +static void LD4_NEON(uint8_t* dst) { // Down-left // Note using the same shift trick as VE4() is slower here. const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0); const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1); @@ -1390,7 +1403,7 @@ static void LD4(uint8_t* dst) { // Down-left //------------------------------------------------------------------------------ // Chroma -static void VE8uv(uint8_t* dst) { // vertical +static void VE8uv_NEON(uint8_t* dst) { // vertical const uint8x8_t top = vld1_u8(dst - BPS); int j; for (j = 0; j < 8; ++j) { @@ -1398,7 +1411,7 @@ static void VE8uv(uint8_t* dst) { // vertical } } -static void HE8uv(uint8_t* dst) { // horizontal +static void HE8uv_NEON(uint8_t* dst) { // horizontal int j; for (j = 0; j < 8; ++j) { const uint8x8_t left = vld1_dup_u8(dst - 1); @@ -1407,7 +1420,7 @@ static void HE8uv(uint8_t* dst) { // horizontal } } -static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { +static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -1458,17 +1471,17 @@ static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) { } } -static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); } -static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); } -static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); } -static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); } +static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); } +static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); } +static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); } +static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); } -static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } +static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); } //------------------------------------------------------------------------------ // 16x16 -static void VE16(uint8_t* dst) { // vertical +static void VE16_NEON(uint8_t* dst) { // vertical const uint8x16_t top = vld1q_u8(dst - BPS); int j; for (j = 0; j < 16; ++j) { @@ -1476,7 +1489,7 @@ static void VE16(uint8_t* dst) { // vertical } } -static void HE16(uint8_t* dst) { // horizontal +static void HE16_NEON(uint8_t* dst) { // horizontal int j; for (j = 0; j < 16; ++j) { const uint8x16_t left = vld1q_dup_u8(dst - 1); @@ -1485,7 +1498,7 @@ static void HE16(uint8_t* dst) { // horizontal } } -static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { +static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -1542,12 +1555,12 @@ static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) { } } -static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); } -static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); } -static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); } -static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); } +static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); } +static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); } +static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); } +static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); } -static void TM16(uint8_t* dst) { +static void TM16_NEON(uint8_t* dst) { const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]' const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]' // A[c] - A[-1] @@ -1556,10 +1569,10 @@ static void TM16(uint8_t* dst) { int y; for (y = 0; y < 16; y += 4) { // left edge - const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1)); - const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1)); - const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1)); - const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1)); + const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1)); + const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1)); + const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1)); + const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1)); const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1] const int16x8_t r1_lo = vaddq_s16(L1, d_lo); const int16x8_t r2_lo = vaddq_s16(L2, d_lo); @@ -1587,49 +1600,49 @@ static void TM16(uint8_t* dst) { extern void VP8DspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) { - VP8Transform = TransformTwo; - VP8TransformAC3 = TransformAC3; - VP8TransformDC = TransformDC; - VP8TransformWHT = TransformWHT; - - VP8VFilter16 = VFilter16; - VP8VFilter16i = VFilter16i; - VP8HFilter16 = HFilter16; + VP8Transform = TransformTwo_NEON; + VP8TransformAC3 = TransformAC3_NEON; + VP8TransformDC = TransformDC_NEON; + VP8TransformWHT = TransformWHT_NEON; + + VP8VFilter16 = VFilter16_NEON; + VP8VFilter16i = VFilter16i_NEON; + VP8HFilter16 = HFilter16_NEON; #if !defined(WORK_AROUND_GCC) - VP8HFilter16i = HFilter16i; + VP8HFilter16i = HFilter16i_NEON; #endif - VP8VFilter8 = VFilter8; - VP8VFilter8i = VFilter8i; + VP8VFilter8 = VFilter8_NEON; + VP8VFilter8i = VFilter8i_NEON; #if !defined(WORK_AROUND_GCC) - VP8HFilter8 = HFilter8; - VP8HFilter8i = HFilter8i; + VP8HFilter8 = HFilter8_NEON; + VP8HFilter8i = HFilter8i_NEON; #endif - VP8SimpleVFilter16 = SimpleVFilter16; - VP8SimpleHFilter16 = SimpleHFilter16; - VP8SimpleVFilter16i = SimpleVFilter16i; - VP8SimpleHFilter16i = SimpleHFilter16i; - - VP8PredLuma4[0] = DC4; - VP8PredLuma4[1] = TM4; - VP8PredLuma4[2] = VE4; - VP8PredLuma4[4] = RD4; - VP8PredLuma4[6] = LD4; - - VP8PredLuma16[0] = DC16TopLeft; - VP8PredLuma16[1] = TM16; - VP8PredLuma16[2] = VE16; - VP8PredLuma16[3] = HE16; - VP8PredLuma16[4] = DC16NoTop; - VP8PredLuma16[5] = DC16NoLeft; - VP8PredLuma16[6] = DC16NoTopLeft; - - VP8PredChroma8[0] = DC8uv; - VP8PredChroma8[1] = TM8uv; - VP8PredChroma8[2] = VE8uv; - VP8PredChroma8[3] = HE8uv; - VP8PredChroma8[4] = DC8uvNoTop; - VP8PredChroma8[5] = DC8uvNoLeft; - VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8SimpleVFilter16 = SimpleVFilter16_NEON; + VP8SimpleHFilter16 = SimpleHFilter16_NEON; + VP8SimpleVFilter16i = SimpleVFilter16i_NEON; + VP8SimpleHFilter16i = SimpleHFilter16i_NEON; + + VP8PredLuma4[0] = DC4_NEON; + VP8PredLuma4[1] = TM4_NEON; + VP8PredLuma4[2] = VE4_NEON; + VP8PredLuma4[4] = RD4_NEON; + VP8PredLuma4[6] = LD4_NEON; + + VP8PredLuma16[0] = DC16TopLeft_NEON; + VP8PredLuma16[1] = TM16_NEON; + VP8PredLuma16[2] = VE16_NEON; + VP8PredLuma16[3] = HE16_NEON; + VP8PredLuma16[4] = DC16NoTop_NEON; + VP8PredLuma16[5] = DC16NoLeft_NEON; + VP8PredLuma16[6] = DC16NoTopLeft_NEON; + + VP8PredChroma8[0] = DC8uv_NEON; + VP8PredChroma8[1] = TM8uv_NEON; + VP8PredChroma8[2] = VE8uv_NEON; + VP8PredChroma8[3] = HE8uv_NEON; + VP8PredChroma8[4] = DC8uvNoTop_NEON; + VP8PredChroma8[5] = DC8uvNoLeft_NEON; + VP8PredChroma8[6] = DC8uvNoTopLeft_NEON; } #else // !WEBP_USE_NEON diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c index 411fb02..873aa59 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c @@ -12,23 +12,25 @@ // Author: somnath@google.com (Somnath Banerjee) // cduvivier@google.com (Christian Duvivier) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C // one it seems => disable it by default. Uncomment the following to enable: -// #define USE_TRANSFORM_AC3 +#if !defined(USE_TRANSFORM_AC3) +#define USE_TRANSFORM_AC3 0 // ALTERNATE_CODE +#endif #include <emmintrin.h> -#include "./common_sse2.h" -#include "../dec/vp8i_dec.h" -#include "../utils/utils.h" +#include "src/dsp/common_sse2.h" +#include "src/dec/vp8i_dec.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) -static void Transform(const int16_t* in, uint8_t* dst, int do_two) { +static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { } } -#if defined(USE_TRANSFORM_AC3) +#if (USE_TRANSFORM_AC3 == 1) #define MUL(a, b) (((a) * (b)) >> 16) static void TransformAC3(const int16_t* in, uint8_t* dst) { static const int kC1 = 20091 + (1 << 16); @@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { _mm_subs_epu8((p), (q))) // Shift each byte of "x" by 3 bits while preserving by the sign bit. -static WEBP_INLINE void SignedShift8b(__m128i* const x) { +static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) { const __m128i zero = _mm_setzero_si128(); const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x); const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x); @@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) { } #define FLIP_SIGN_BIT2(a, b) { \ - a = _mm_xor_si128(a, sign_bit); \ - b = _mm_xor_si128(b, sign_bit); \ + (a) = _mm_xor_si128(a, sign_bit); \ + (b) = _mm_xor_si128(b, sign_bit); \ } #define FLIP_SIGN_BIT4(a, b, c, d) { \ @@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) { } // input/output is uint8_t -static WEBP_INLINE void GetNotHEV(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - int hev_thresh, __m128i* const not_hev) { +static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + int hev_thresh, __m128i* const not_hev) { const __m128i zero = _mm_setzero_si128(); const __m128i t_1 = MM_ABS(*p1, *p0); const __m128i t_2 = MM_ABS(*q1, *q0); @@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1, } // input pixels are int8_t -static WEBP_INLINE void GetBaseDelta(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - __m128i* const delta) { +static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + __m128i* const delta) { // beware of addition order, for saturation! const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1 const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0 @@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1, } // input and output are int8_t -static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0, - const __m128i* const fl) { +static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0, + __m128i* const q0, + const __m128i* const fl) { const __m128i k3 = _mm_set1_epi8(3); const __m128i k4 = _mm_set1_epi8(4); __m128i v3 = _mm_adds_epi8(*fl, k3); __m128i v4 = _mm_adds_epi8(*fl, k4); - SignedShift8b(&v4); // v4 >> 3 - SignedShift8b(&v3); // v3 >> 3 + SignedShift8b_SSE2(&v4); // v4 >> 3 + SignedShift8b_SSE2(&v3); // v3 >> 3 *q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4 *p0 = _mm_adds_epi8(*p0, v3); // p0 += v3 } @@ -317,27 +320,27 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0, // Update operations: // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). -static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi, - const __m128i* const a0_lo, - const __m128i* const a0_hi) { +static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi, + const __m128i* const a0_lo, + const __m128i* const a0_hi) { const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7); const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7); const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi); - const __m128i sign_bit = _mm_set1_epi8(0x80); + const __m128i sign_bit = _mm_set1_epi8((char)0x80); *pi = _mm_adds_epi8(*pi, delta); *qi = _mm_subs_epi8(*qi, delta); FLIP_SIGN_BIT2(*pi, *qi); } // input pixels are uint8_t -static WEBP_INLINE void NeedsFilter(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - int thresh, __m128i* const mask) { - const __m128i m_thresh = _mm_set1_epi8(thresh); +static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + int thresh, __m128i* const mask) { + const __m128i m_thresh = _mm_set1_epi8((char)thresh); const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) - const __m128i kFE = _mm_set1_epi8(0xFE); + const __m128i kFE = _mm_set1_epi8((char)0xFE); const __m128i t2 = _mm_and_si128(t1, kFE); // set lsb of each byte to zero const __m128i t3 = _mm_srli_epi16(t2, 1); // abs(p1 - q1) / 2 @@ -353,30 +356,31 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1, // Edge filtering functions // Applies filter on 2 pixels (p0 and q0) -static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0, - __m128i* const q0, __m128i* const q1, - int thresh) { +static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0, + __m128i* const q0, __m128i* const q1, + int thresh) { __m128i a, mask; - const __m128i sign_bit = _mm_set1_epi8(0x80); - // convert p1/q1 to int8_t (for GetBaseDelta) + const __m128i sign_bit = _mm_set1_epi8((char)0x80); + // convert p1/q1 to int8_t (for GetBaseDelta_SSE2) const __m128i p1s = _mm_xor_si128(*p1, sign_bit); const __m128i q1s = _mm_xor_si128(*q1, sign_bit); - NeedsFilter(p1, p0, q0, q1, thresh, &mask); + NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask); FLIP_SIGN_BIT2(*p0, *q0); - GetBaseDelta(&p1s, p0, q0, &q1s, &a); + GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a); a = _mm_and_si128(a, mask); // mask filter values we don't care about - DoSimpleFilter(p0, q0, &a); + DoSimpleFilter_SSE2(p0, q0, &a); FLIP_SIGN_BIT2(*p0, *q0); } // Applies filter on 4 pixels (p1, p0, q0 and q1) -static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, - __m128i* const q0, __m128i* const q1, - const __m128i* const mask, int hev_thresh) { +static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0, + __m128i* const q0, __m128i* const q1, + const __m128i* const mask, + int hev_thresh) { const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bit = _mm_set1_epi8(0x80); + const __m128i sign_bit = _mm_set1_epi8((char)0x80); const __m128i k64 = _mm_set1_epi8(64); const __m128i k3 = _mm_set1_epi8(3); const __m128i k4 = _mm_set1_epi8(4); @@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, __m128i t1, t2, t3; // compute hev mask - GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); + GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, ¬_hev); // convert to signed values FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); @@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3 t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4 - SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 - SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + SignedShift8b_SSE2(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + SignedShift8b_SSE2(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 *p0 = _mm_adds_epi8(*p0, t2); // p0 += t2 *q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3 FLIP_SIGN_BIT2(*p0, *q0); @@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, } // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) -static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, - __m128i* const p0, __m128i* const q0, - __m128i* const q1, __m128i* const q2, - const __m128i* const mask, int hev_thresh) { +static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1, + __m128i* const p0, __m128i* const q0, + __m128i* const q1, __m128i* const q2, + const __m128i* const mask, + int hev_thresh) { const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bit = _mm_set1_epi8(0x80); + const __m128i sign_bit = _mm_set1_epi8((char)0x80); __m128i a, not_hev; // compute hev mask - GetNotHEV(p1, p0, q0, q1, hev_thresh, ¬_hev); + GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, ¬_hev); FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); FLIP_SIGN_BIT2(*p2, *q2); - GetBaseDelta(p1, p0, q0, q1, &a); + GetBaseDelta_SSE2(p1, p0, q0, q1, &a); { // do simple filter on pixels with hev const __m128i m = _mm_andnot_si128(not_hev, *mask); const __m128i f = _mm_and_si128(a, m); - DoSimpleFilter(p0, q0, &f); + DoSimpleFilter_SSE2(p0, q0, &f); } { // do strong filter on pixels with not hev @@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63 const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63 - Update2Pixels(p2, q2, &a2_lo, &a2_hi); - Update2Pixels(p1, q1, &a1_lo, &a1_hi); - Update2Pixels(p0, q0, &a0_lo, &a0_hi); + Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi); + Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi); + Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi); } } // reads 8 rows across a vertical edge. -static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride, - __m128i* const p, __m128i* const q) { +static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride, + __m128i* const p, __m128i* const q) { // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( @@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride, *q = _mm_unpackhi_epi32(C0, C1); } -static WEBP_INLINE void Load16x4(const uint8_t* const r0, - const uint8_t* const r8, - int stride, - __m128i* const p1, __m128i* const p0, - __m128i* const q0, __m128i* const q1) { +static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0, + const uint8_t* const r8, + int stride, + __m128i* const p1, __m128i* const p0, + __m128i* const q0, __m128i* const q1) { // Assume the pixels around the edge (|) are numbered as follows // 00 01 | 02 03 // 10 11 | 12 13 @@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0, // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - Load8x4(r0, stride, p1, q0); - Load8x4(r8, stride, p0, q1); + Load8x4_SSE2(r0, stride, p1, q0); + Load8x4_SSE2(r8, stride, p0, q1); { // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 @@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0, } } -static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) { +static WEBP_INLINE void Store4x4_SSE2(__m128i* const x, + uint8_t* dst, int stride) { int i; for (i = 0; i < 4; ++i, dst += stride) { WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x)); @@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) { } // Transpose back and store -static WEBP_INLINE void Store16x4(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - uint8_t* r0, uint8_t* r8, - int stride) { +static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + uint8_t* r0, uint8_t* r8, + int stride) { __m128i t1, p1_s, p0_s, q0_s, q1_s; // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 @@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1, p1_s = _mm_unpacklo_epi16(t1, q1_s); q1_s = _mm_unpackhi_epi16(t1, q1_s); - Store4x4(&p0_s, r0, stride); + Store4x4_SSE2(&p0_s, r0, stride); r0 += 4 * stride; - Store4x4(&q0_s, r0, stride); + Store4x4_SSE2(&q0_s, r0, stride); - Store4x4(&p1_s, r8, stride); + Store4x4_SSE2(&p1_s, r8, stride); r8 += 4 * stride; - Store4x4(&q1_s, r8, stride); + Store4x4_SSE2(&q1_s, r8, stride); } //------------------------------------------------------------------------------ // Simple In-loop filtering (Paragraph 15.2) -static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) { // Load __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]); __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]); __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]); __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]); - DoFilter2(&p1, &p0, &q0, &q1, thresh); + DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh); // Store _mm_storeu_si128((__m128i*)&p[-stride], p0); _mm_storeu_si128((__m128i*)&p[0], q0); } -static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) { __m128i p1, p0, q0, q1; p -= 2; // beginning of p1 - Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1); - DoFilter2(&p1, &p0, &q0, &q1, thresh); - Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride); + Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1); + DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh); + Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride); } -static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) { int k; for (k = 3; k > 0; --k) { p += 4 * stride; - SimpleVFilter16(p, stride, thresh); + SimpleVFilter16_SSE2(p, stride, thresh); } } -static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { +static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) { int k; for (k = 3; k > 0; --k) { p += 4; - SimpleHFilter16(p, stride, thresh); + SimpleHFilter16_SSE2(p, stride, thresh); } } @@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { // Complex In-loop filtering (Paragraph 15.3) #define MAX_DIFF1(p3, p2, p1, p0, m) do { \ - m = MM_ABS(p1, p0); \ - m = _mm_max_epu8(m, MM_ABS(p3, p2)); \ - m = _mm_max_epu8(m, MM_ABS(p2, p1)); \ + (m) = MM_ABS(p1, p0); \ + (m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \ + (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \ } while (0) #define MAX_DIFF2(p3, p2, p1, p0, m) do { \ - m = _mm_max_epu8(m, MM_ABS(p1, p0)); \ - m = _mm_max_epu8(m, MM_ABS(p3, p2)); \ - m = _mm_max_epu8(m, MM_ABS(p2, p1)); \ + (m) = _mm_max_epu8(m, MM_ABS(p1, p0)); \ + (m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \ + (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \ } while (0) #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \ - e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \ - e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \ - e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \ - e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \ + (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \ + (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \ + (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \ + (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \ } #define LOADUV_H_EDGE(p, u, v, stride) do { \ const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \ const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \ - p = _mm_unpacklo_epi64(U, V); \ + (p) = _mm_unpacklo_epi64(U, V); \ } while (0) #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \ - LOADUV_H_EDGE(e1, u, v, 0 * stride); \ - LOADUV_H_EDGE(e2, u, v, 1 * stride); \ - LOADUV_H_EDGE(e3, u, v, 2 * stride); \ - LOADUV_H_EDGE(e4, u, v, 3 * stride); \ + LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \ + LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \ + LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \ + LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \ } #define STOREUV(p, u, v, stride) { \ - _mm_storel_epi64((__m128i*)&u[(stride)], p); \ - p = _mm_srli_si128(p, 8); \ - _mm_storel_epi64((__m128i*)&v[(stride)], p); \ + _mm_storel_epi64((__m128i*)&(u)[(stride)], p); \ + (p) = _mm_srli_si128(p, 8); \ + _mm_storel_epi64((__m128i*)&(v)[(stride)], p); \ } -static WEBP_INLINE void ComplexMask(const __m128i* const p1, - const __m128i* const p0, - const __m128i* const q0, - const __m128i* const q1, - int thresh, int ithresh, - __m128i* const mask) { +static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1, + const __m128i* const p0, + const __m128i* const q0, + const __m128i* const q1, + int thresh, int ithresh, + __m128i* const mask) { const __m128i it = _mm_set1_epi8(ithresh); const __m128i diff = _mm_subs_epu8(*mask, it); const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128()); __m128i filter_mask; - NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask); + NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask); *mask = _mm_and_si128(thresh_mask, filter_mask); } // on macroblock edges -static void VFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i t1; __m128i mask; __m128i p2, p1, p0, q0, q1, q2; @@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride, LOAD_H_EDGES4(p, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&p[-3 * stride], p2); @@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride, _mm_storeu_si128((__m128i*)&p[+2 * stride], q2); } -static void HFilter16(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i p3, p2, p1, p0, q0, q1, q2, q3; uint8_t* const b = p - 4; - Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0 + Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); MAX_DIFF1(p3, p2, p1, p0, mask); - Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3 + Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); MAX_DIFF2(q3, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); - Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride); - Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride); + Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride); + Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride); } // on three inner edges -static void VFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter16i_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { int k; __m128i p3, p2, p1, p0; // loop invariants @@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride, // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. - ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh); // Store _mm_storeu_si128((__m128i*)&b[0 * stride], p1); @@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride, } } -static void HFilter16i(uint8_t* p, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter16i_SSE2(uint8_t* p, int stride, + int thresh, int ithresh, int hev_thresh) { int k; __m128i p3, p2, p1, p0; // loop invariants - Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue + Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue for (k = 3; k > 0; --k) { __m128i mask, tmp1, tmp2; @@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride, p += 4; // beginning of q0 (and next span) MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask - Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2); + Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2); MAX_DIFF2(p3, p2, tmp1, tmp2, mask); - ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh); - Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride); + Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride); // rotate samples p1 = tmp1; @@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride, } // 8-pixels wide variant, for chroma filtering -static void VFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, p2, p1, p0, q0, q1, q2; @@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride, LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1); MAX_DIFF2(t1, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); // Store STOREUV(p2, u, v, -3 * stride); @@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride, STOREUV(q2, u, v, 2 * stride); } -static void HFilter8(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i p3, p2, p1, p0, q0, q1, q2, q3; uint8_t* const tu = u - 4; uint8_t* const tv = v - 4; - Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0 + Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0); MAX_DIFF1(p3, p2, p1, p0, mask); - Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3 + Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3); MAX_DIFF2(q3, q2, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh); - Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride); - Store16x4(&q0, &q1, &q2, &q3, u, v, stride); + Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride); + Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride); } -static void VFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; @@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride, LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2); MAX_DIFF2(t2, t1, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh); // Store STOREUV(p1, u, v, -2 * stride); @@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride, STOREUV(q1, u, v, 1 * stride); } -static void HFilter8i(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_thresh) { +static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_thresh) { __m128i mask; __m128i t1, t2, p1, p0, q0, q1; - Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0 + Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0 MAX_DIFF1(t2, t1, p1, p0, mask); u += 4; // beginning of q0 v += 4; - Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3 + Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3 MAX_DIFF2(t2, t1, q1, q0, mask); - ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); - DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh); + ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask); + DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh); u -= 2; // beginning of p1 v -= 2; - Store16x4(&p1, &p0, &q0, &q1, u, v, stride); + Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride); } //------------------------------------------------------------------------------ @@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, // where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 // and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 -static void VE4(uint8_t* dst) { // vertical +static void VE4_SSE2(uint8_t* dst) { // vertical const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) { // vertical } } -static void LD4(uint8_t* dst) { // Down-Left +static void LD4_SSE2(uint8_t* dst) { // Down-Left const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) { // Down-Left WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static void VR4(uint8_t* dst) { // Vertical-Right +static void VR4_SSE2(uint8_t* dst) { // Vertical-Right const __m128i one = _mm_set1_epi8(1); const int I = dst[-1 + 0 * BPS]; const int J = dst[-1 + 1 * BPS]; @@ -935,7 +941,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right const __m128i ABCD0 = _mm_srli_si128(XABCD, 1); const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0); const __m128i _XABCD = _mm_slli_si128(XABCD, 1); - const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0); + const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0); const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0); const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); @@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right DST(0, 3) = AVG3(K, J, I); } -static void VL4(uint8_t* dst) { // Vertical-Left +static void VL4_SSE2(uint8_t* dst) { // Vertical-Left const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS)); const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); @@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left DST(3, 3) = (extra_out >> 8) & 0xff; } -static void RD4(uint8_t* dst) { // Down-right +static void RD4_SSE2(uint8_t* dst) { // Down-right const __m128i one = _mm_set1_epi8(1); const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); const __m128i ____XABCD = _mm_slli_si128(XABCD, 4); @@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) { // Down-right //------------------------------------------------------------------------------ // Luma 16x16 -static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { +static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { const uint8_t* top = dst - BPS; const __m128i zero = _mm_setzero_si128(); int y; @@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) { } } -static void TM4(uint8_t* dst) { TrueMotion(dst, 4); } -static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); } -static void TM16(uint8_t* dst) { TrueMotion(dst, 16); } +static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); } +static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); } +static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); } -static void VE16(uint8_t* dst) { +static void VE16_SSE2(uint8_t* dst) { const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); int j; for (j = 0; j < 16; ++j) { @@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) { } } -static void HE16(uint8_t* dst) { // horizontal +static void HE16_SSE2(uint8_t* dst) { // horizontal int j; for (j = 16; j > 0; --j) { const __m128i values = _mm_set1_epi8(dst[-1]); @@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) { // horizontal } } -static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { +static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; const __m128i values = _mm_set1_epi8(v); for (j = 0; j < 16; ++j) { @@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { } } -static void DC16(uint8_t* dst) { // DC +static void DC16_SSE2(uint8_t* dst) { // DC const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); const __m128i sad8x2 = _mm_sad_epu8(top, zero); @@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) { // DC } { const int DC = _mm_cvtsi128_si32(sum) + left + 16; - Put16(DC >> 5, dst); + Put16_SSE2(DC >> 5, dst); } } -static void DC16NoTop(uint8_t* dst) { // DC with top samples not available +static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable int DC = 8; int j; for (j = 0; j < 16; ++j) { DC += dst[-1 + j * BPS]; } - Put16(DC >> 4, dst); + Put16_SSE2(DC >> 4, dst); } -static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available +static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); const __m128i sad8x2 = _mm_sad_epu8(top, zero); // sum the two sads: sad8x2[0:1] + sad8x2[8:9] const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); const int DC = _mm_cvtsi128_si32(sum) + 8; - Put16(DC >> 4, dst); + Put16_SSE2(DC >> 4, dst); } -static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples - Put16(0x80, dst); +static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples + Put16_SSE2(0x80, dst); } //------------------------------------------------------------------------------ // Chroma -static void VE8uv(uint8_t* dst) { // vertical +static void VE8uv_SSE2(uint8_t* dst) { // vertical int j; const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); for (j = 0; j < 8; ++j) { @@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) { // vertical } } -static void HE8uv(uint8_t* dst) { // horizontal - int j; - for (j = 0; j < 8; ++j) { - const __m128i values = _mm_set1_epi8(dst[-1]); - _mm_storel_epi64((__m128i*)dst, values); - dst += BPS; - } -} - // helper for chroma-DC predictions -static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { +static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; const __m128i values = _mm_set1_epi8(v); for (j = 0; j < 8; ++j) { @@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { } } -static void DC8uv(uint8_t* dst) { // DC +static void DC8uv_SSE2(uint8_t* dst) { // DC const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); const __m128i sum = _mm_sad_epu8(top, zero); @@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) { // DC } { const int DC = _mm_cvtsi128_si32(sum) + left + 8; - Put8x8uv(DC >> 4, dst); + Put8x8uv_SSE2(DC >> 4, dst); } } -static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples +static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples const __m128i zero = _mm_setzero_si128(); const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); const __m128i sum = _mm_sad_epu8(top, zero); const int DC = _mm_cvtsi128_si32(sum) + 4; - Put8x8uv(DC >> 3, dst); + Put8x8uv_SSE2(DC >> 3, dst); } -static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples +static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples int dc0 = 4; int i; for (i = 0; i < 8; ++i) { dc0 += dst[-1 + i * BPS]; } - Put8x8uv(dc0 >> 3, dst); + Put8x8uv_SSE2(dc0 >> 3, dst); } -static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing - Put8x8uv(0x80, dst); +static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing + Put8x8uv_SSE2(0x80, dst); } //------------------------------------------------------------------------------ @@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing extern void VP8DspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) { - VP8Transform = Transform; -#if defined(USE_TRANSFORM_AC3) - VP8TransformAC3 = TransformAC3; + VP8Transform = Transform_SSE2; +#if (USE_TRANSFORM_AC3 == 1) + VP8TransformAC3 = TransformAC3_SSE2; #endif - VP8VFilter16 = VFilter16; - VP8HFilter16 = HFilter16; - VP8VFilter8 = VFilter8; - VP8HFilter8 = HFilter8; - VP8VFilter16i = VFilter16i; - VP8HFilter16i = HFilter16i; - VP8VFilter8i = VFilter8i; - VP8HFilter8i = HFilter8i; - - VP8SimpleVFilter16 = SimpleVFilter16; - VP8SimpleHFilter16 = SimpleHFilter16; - VP8SimpleVFilter16i = SimpleVFilter16i; - VP8SimpleHFilter16i = SimpleHFilter16i; - - VP8PredLuma4[1] = TM4; - VP8PredLuma4[2] = VE4; - VP8PredLuma4[4] = RD4; - VP8PredLuma4[5] = VR4; - VP8PredLuma4[6] = LD4; - VP8PredLuma4[7] = VL4; - - VP8PredLuma16[0] = DC16; - VP8PredLuma16[1] = TM16; - VP8PredLuma16[2] = VE16; - VP8PredLuma16[3] = HE16; - VP8PredLuma16[4] = DC16NoTop; - VP8PredLuma16[5] = DC16NoLeft; - VP8PredLuma16[6] = DC16NoTopLeft; - - VP8PredChroma8[0] = DC8uv; - VP8PredChroma8[1] = TM8uv; - VP8PredChroma8[2] = VE8uv; - VP8PredChroma8[3] = HE8uv; - VP8PredChroma8[4] = DC8uvNoTop; - VP8PredChroma8[5] = DC8uvNoLeft; - VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8VFilter16 = VFilter16_SSE2; + VP8HFilter16 = HFilter16_SSE2; + VP8VFilter8 = VFilter8_SSE2; + VP8HFilter8 = HFilter8_SSE2; + VP8VFilter16i = VFilter16i_SSE2; + VP8HFilter16i = HFilter16i_SSE2; + VP8VFilter8i = VFilter8i_SSE2; + VP8HFilter8i = HFilter8i_SSE2; + + VP8SimpleVFilter16 = SimpleVFilter16_SSE2; + VP8SimpleHFilter16 = SimpleHFilter16_SSE2; + VP8SimpleVFilter16i = SimpleVFilter16i_SSE2; + VP8SimpleHFilter16i = SimpleHFilter16i_SSE2; + + VP8PredLuma4[1] = TM4_SSE2; + VP8PredLuma4[2] = VE4_SSE2; + VP8PredLuma4[4] = RD4_SSE2; + VP8PredLuma4[5] = VR4_SSE2; + VP8PredLuma4[6] = LD4_SSE2; + VP8PredLuma4[7] = VL4_SSE2; + + VP8PredLuma16[0] = DC16_SSE2; + VP8PredLuma16[1] = TM16_SSE2; + VP8PredLuma16[2] = VE16_SSE2; + VP8PredLuma16[3] = HE16_SSE2; + VP8PredLuma16[4] = DC16NoTop_SSE2; + VP8PredLuma16[5] = DC16NoLeft_SSE2; + VP8PredLuma16[6] = DC16NoTopLeft_SSE2; + + VP8PredChroma8[0] = DC8uv_SSE2; + VP8PredChroma8[1] = TM8uv_SSE2; + VP8PredChroma8[2] = VE8uv_SSE2; + VP8PredChroma8[4] = DC8uvNoTop_SSE2; + VP8PredChroma8[5] = DC8uvNoLeft_SSE2; + VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c index 4e81ec4..8f18506 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c @@ -11,15 +11,15 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE41) #include <smmintrin.h> -#include "../dec/vp8i_dec.h" -#include "../utils/utils.h" +#include "src/dec/vp8i_dec.h" +#include "src/utils/utils.h" -static void HE16(uint8_t* dst) { // horizontal +static void HE16_SSE41(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { @@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) { // horizontal extern void VP8DspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) { - VP8PredLuma16[3] = HE16; + VP8PredLuma16[3] = HE16_SSE41; } #else // !WEBP_USE_SSE41 diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h index 813fed4..0d7f3fb 100644 --- a/src/3rdparty/libwebp/src/dsp/dsp.h +++ b/src/3rdparty/libwebp/src/dsp/dsp.h @@ -15,10 +15,10 @@ #define WEBP_DSP_DSP_H_ #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -38,17 +38,29 @@ extern "C" { # define LOCAL_GCC_PREREQ(maj, min) 0 #endif +#if defined(__clang__) +# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) +# define LOCAL_CLANG_PREREQ(maj, min) \ + (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) +#else +# define LOCAL_CLANG_VERSION 0 +# define LOCAL_CLANG_PREREQ(maj, min) 0 +#endif + #ifndef __has_builtin # define __has_builtin(x) 0 #endif +// for now, none of the optimizations below are available in emscripten +#if !defined(EMSCRIPTEN) + #if defined(_MSC_VER) && _MSC_VER > 1310 && \ - (defined(_M_X64) || defined(_M_IX86)) + (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__) #define WEBP_MSC_SSE2 // Visual C++ SSE2 targets #endif #if defined(_MSC_VER) && _MSC_VER >= 1500 && \ - (defined(_M_X64) || defined(_M_IX86)) + (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__) #define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets #endif @@ -64,23 +76,21 @@ extern "C" { #define WEBP_USE_SSE41 #endif -#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2) -#define WEBP_USE_AVX2 -#endif - -#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) -#define WEBP_ANDROID_NEON // Android targets that might support NEON -#endif - // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. -#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \ +#if (defined(__ARM_NEON__) || \ defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \ !defined(__native_client__) #define WEBP_USE_NEON #endif -#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM) +#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \ + defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H) +#define WEBP_ANDROID_NEON // Android targets that may have NEON +#define WEBP_USE_NEON +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM) && !defined(__clang__) #define WEBP_USE_NEON #define WEBP_USE_INTRINSICS #endif @@ -90,7 +100,7 @@ extern "C" { #define WEBP_USE_MIPS32 #if (__mips_isa_rev >= 2) #define WEBP_USE_MIPS32_R2 -#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2) +#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2) #define WEBP_USE_MIPS_DSP_R2 #endif #endif @@ -100,6 +110,24 @@ extern "C" { #define WEBP_USE_MSA #endif +#endif /* EMSCRIPTEN */ + +#ifndef WEBP_DSP_OMIT_C_CODE +#define WEBP_DSP_OMIT_C_CODE 1 +#endif + +#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE +#define WEBP_NEON_OMIT_C_CODE 1 +#else +#define WEBP_NEON_OMIT_C_CODE 0 +#endif + +#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) +#define WEBP_NEON_WORK_AROUND_GCC 1 +#else +#define WEBP_NEON_WORK_AROUND_GCC 0 +#endif + // This macro prevents thread_sanitizer from reporting known concurrent writes. #define WEBP_TSAN_IGNORE_FUNCTION #if defined(__has_feature) @@ -109,6 +137,42 @@ extern "C" { #endif #endif +#if defined(WEBP_USE_THREAD) && !defined(_WIN32) +#include <pthread.h> // NOLINT + +#define WEBP_DSP_INIT(func) do { \ + static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ + (VP8CPUInfo)&func ## _last_cpuinfo_used; \ + static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \ + if (pthread_mutex_lock(&func ## _lock)) break; \ + if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \ + func ## _last_cpuinfo_used = VP8GetCPUInfo; \ + (void)pthread_mutex_unlock(&func ## _lock); \ +} while (0) +#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32)) +#define WEBP_DSP_INIT(func) do { \ + static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ + (VP8CPUInfo)&func ## _last_cpuinfo_used; \ + if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \ + func(); \ + func ## _last_cpuinfo_used = VP8GetCPUInfo; \ +} while (0) +#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32) + +// Defines an Init + helper function that control multiple initialization of +// function pointers / tables. +/* Usage: + WEBP_DSP_INIT_FUNC(InitFunc) { + ...function body + } +*/ +#define WEBP_DSP_INIT_FUNC(name) \ + static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \ + WEBP_TSAN_IGNORE_FUNCTION void name(void) { \ + WEBP_DSP_INIT(name ## _body); \ + } \ + static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void) + #define WEBP_UBSAN_IGNORE_UNDEF #define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW #if defined(__clang__) && defined(__has_attribute) @@ -129,6 +193,18 @@ extern "C" { #endif #endif +// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) +#if !defined(WEBP_SWAP_16BIT_CSP) +#define WEBP_SWAP_16BIT_CSP 0 +#endif + +// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) +#if !defined(WORDS_BIGENDIAN) && \ + (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) +#define WORDS_BIGENDIAN +#endif + typedef enum { kSSE2, kSSE3, @@ -143,7 +219,7 @@ typedef enum { } CPUFeature; // returns true if the CPU supports the feature. typedef int (*VP8CPUInfo)(CPUFeature feature); -WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo; +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; //------------------------------------------------------------------------------ // Init stub generator @@ -152,7 +228,7 @@ WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo; // avoiding a compiler warning. #define WEBP_DSP_INIT_STUB(func) \ extern void func(void); \ - WEBP_TSAN_IGNORE_FUNCTION void func(void) {} + void func(void) {} //------------------------------------------------------------------------------ // Encoding @@ -271,6 +347,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1, int xo, int yo, // center position int W, int H); // plane dimension +#if !defined(WEBP_REDUCE_SIZE) // This version is called with the guarantee that you can load 8 bytes and // 8 rows at offset src1 and src2 typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1, @@ -278,10 +355,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1, extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping +#endif +#if !defined(WEBP_DISABLE_STATS) typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1, const uint8_t* src2, int len); extern VP8AccumulateSSEFunc VP8AccumulateSSE; +#endif // must be called before using any of the above directly void VP8SSIMDspInit(void); @@ -462,12 +542,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand; extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink; // Plain-C implementation, as fall-back. -extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk, - const uint8_t* src); -extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk, - const uint8_t* src); -extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk); -extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk); +extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk, + const uint8_t* src); +extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk, + const uint8_t* src); +extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk); +extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk); // Main entry calls: extern void WebPRescalerImportRow(struct WebPRescaler* const wrk, @@ -533,24 +613,28 @@ void WebPMultRows(uint8_t* ptr, int stride, int width, int num_rows, int inverse); // Plain-C versions, used as fallback by some implementations. -void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha, - int width, int inverse); -void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse); - -// To be called first before using the above. -void WebPInitAlphaProcessing(void); +void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha, + int width, int inverse); +void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse); +#ifdef WORDS_BIGENDIAN // ARGB packing function: a/r/g/b input is rgba or bgra order. -extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r, - const uint8_t* g, const uint8_t* b, int len, - uint32_t* out); +extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, + const uint8_t* g, const uint8_t* b, int len, + uint32_t* out); +#endif // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order. -extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out); +extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, + int len, int step, uint32_t* out); + +// This function returns true if src[i] contains a value different from 0xff. +extern int (*WebPHasAlpha8b)(const uint8_t* src, int length); +// This function returns true if src[4*i] contains a value different from 0xff. +extern int (*WebPHasAlpha32b)(const uint8_t* src, int length); // To be called first before using the above. -void VP8EncDspARGBInit(void); +void WebPInitAlphaProcessing(void); //------------------------------------------------------------------------------ // Filter functions @@ -591,4 +675,4 @@ void VP8FiltersInit(void); } // extern "C" #endif -#endif /* WEBP_DSP_DSP_H_ */ +#endif // WEBP_DSP_DSP_H_ diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c index f31bc6d..2fddbc4 100644 --- a/src/3rdparty/libwebp/src/dsp/enc.c +++ b/src/3rdparty/libwebp/src/dsp/enc.c @@ -14,16 +14,18 @@ #include <assert.h> #include <stdlib.h> // for abs() -#include "./dsp.h" -#include "../enc/vp8i_enc.h" +#include "src/dsp/dsp.h" +#include "src/enc/vp8i_enc.h" static WEBP_INLINE uint8_t clip_8b(int v) { return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; } +#if !WEBP_NEON_OMIT_C_CODE static WEBP_INLINE int clip_max(int v, int max) { return (v > max) ? max : v; } +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ // Compute susceptibility based on DCT-coeff histograms: @@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1], histo->last_non_zero = last_non_zero; } -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +#if !WEBP_NEON_OMIT_C_CODE +static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { @@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, } VP8SetHistogramData(distribution, histo); } +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ // run-time tables (~4k) @@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) { //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) +#if !WEBP_NEON_OMIT_C_CODE + #define STORE(x, y, v) \ dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) @@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, } } -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) { int i; int tmp[16]; for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { @@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform2_C(const uint8_t* src, const uint8_t* ref, + int16_t* out) { VP8FTransform(src, ref, out); VP8FTransform(src + 4, ref + 4, out + 16); } -static void FTransformWHT(const int16_t* in, int16_t* out) { +#if !WEBP_NEON_OMIT_C_CODE +static void FTransformWHT_C(const int16_t* in, int16_t* out) { // input is 12b signed int32_t tmp[16]; int i; @@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { out[12 + i] = b3 >> 1; } } +#endif // !WEBP_NEON_OMIT_C_CODE #undef MUL #undef STORE @@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DCMode(C8DC8 + dst, left, top, 8, 8, 4); VerticalPred(C8VE8 + dst, top, 8); @@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_C(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DCMode(I16DC16 + dst, left, top, 16, 16, 5); VerticalPred(I16VE16 + dst, top, 16); HorizontalPred(I16HE16 + dst, left, 16); @@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Metric +#if !WEBP_NEON_OMIT_C_CODE static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) { int count = 0; @@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b, return count; } -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 16, 16); } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 16, 8); } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 8, 8); } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_C(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 4, 4); } +#endif // !WEBP_NEON_OMIT_C_CODE -static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) { int k, x, y; for (k = 0; k < 4; ++k) { uint32_t avg = 0; @@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { // We try to match the spectral content (weighted) between source and // reconstructed samples. +#if !WEBP_NEON_OMIT_C_CODE // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. @@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { const int sum1 = TTransform(a, w); const int sum2 = TTransform(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_C(a + x + y, b + x + y, w); } } return D; } +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ // Quantization @@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = { }; // Simple quantization -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_C(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int last = -1; int n; for (n = 0; n < 16; ++n) { @@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return (last >= 0); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC +static int Quantize2Blocks_C(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } +#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC //------------------------------------------------------------------------------ // Block copy @@ -682,149 +698,15 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) { } } -static void Copy4x4(const uint8_t* src, uint8_t* dst) { +static void Copy4x4_C(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4, 4); } -static void Copy16x8(const uint8_t* src, uint8_t* dst) { +static void Copy16x8_C(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16, 8); } //------------------------------------------------------------------------------ -// SSIM / PSNR - -// hat-shaped filter. Sum of coefficients is equal to 16. -static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = { - 1, 2, 3, 4, 3, 2, 1 -}; -static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2 - -static WEBP_INLINE double SSIMCalculation( - const VP8DistoStats* const stats, uint32_t N /*num samples*/) { - const uint32_t w2 = N * N; - const uint32_t C1 = 20 * w2; - const uint32_t C2 = 60 * w2; - const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6 - const uint64_t xmxm = (uint64_t)stats->xm * stats->xm; - const uint64_t ymym = (uint64_t)stats->ym * stats->ym; - if (xmxm + ymym >= C3) { - const int64_t xmym = (int64_t)stats->xm * stats->ym; - const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative - const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm; - const uint64_t syy = (uint64_t)stats->yym * N - ymym; - // we descale by 8 to prevent overflow during the fnum/fden multiply. - const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8; - const uint64_t den_S = (sxx + syy + C2) >> 8; - const uint64_t fnum = (2 * xmym + C1) * num_S; - const uint64_t fden = (xmxm + ymym + C1) * den_S; - const double r = (double)fnum / fden; - assert(r >= 0. && r <= 1.0); - return r; - } - return 1.; // area is too dark to contribute meaningfully -} - -double VP8SSIMFromStats(const VP8DistoStats* const stats) { - return SSIMCalculation(stats, kWeightSum); -} - -double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) { - return SSIMCalculation(stats, stats->w); -} - -static double SSIMGetClipped_C(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2, - int xo, int yo, int W, int H) { - VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; - const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; - const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 - : yo + VP8_SSIM_KERNEL; - const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; - const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 - : xo + VP8_SSIM_KERNEL; - int x, y; - src1 += ymin * stride1; - src2 += ymin * stride2; - for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { - for (x = xmin; x <= xmax; ++x) { - const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo] - * kWeight[VP8_SSIM_KERNEL + y - yo]; - const uint32_t s1 = src1[x]; - const uint32_t s2 = src2[x]; - stats.w += w; - stats.xm += w * s1; - stats.ym += w * s2; - stats.xxm += w * s1 * s1; - stats.xym += w * s1 * s2; - stats.yym += w * s2 * s2; - } - } - return VP8SSIMFromStatsClipped(&stats); -} - -static double SSIMGet_C(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2) { - VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; - int x, y; - for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { - for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { - const uint32_t w = kWeight[x] * kWeight[y]; - const uint32_t s1 = src1[x]; - const uint32_t s2 = src2[x]; - stats.xm += w * s1; - stats.ym += w * s2; - stats.xxm += w * s1 * s1; - stats.xym += w * s1 * s2; - stats.yym += w * s2 * s2; - } - } - return VP8SSIMFromStats(&stats); -} - -//------------------------------------------------------------------------------ - -static uint32_t AccumulateSSE(const uint8_t* src1, - const uint8_t* src2, int len) { - int i; - uint32_t sse2 = 0; - assert(len <= 65535); // to ensure that accumulation fits within uint32_t - for (i = 0; i < len; ++i) { - const int32_t diff = src1[i] - src2[i]; - sse2 += diff * diff; - } - return sse2; -} - -//------------------------------------------------------------------------------ - -VP8SSIMGetFunc VP8SSIMGet; -VP8SSIMGetClippedFunc VP8SSIMGetClipped; -VP8AccumulateSSEFunc VP8AccumulateSSE; - -extern void VP8SSIMDspInitSSE2(void); - -static volatile VP8CPUInfo ssim_last_cpuinfo_used = - (VP8CPUInfo)&ssim_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { - if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; - - VP8SSIMGetClipped = SSIMGetClipped_C; - VP8SSIMGet = SSIMGet_C; - - VP8AccumulateSSE = AccumulateSSE; - if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) - if (VP8GetCPUInfo(kSSE2)) { - VP8SSIMDspInitSSE2(); - } -#endif - } - - ssim_last_cpuinfo_used = VP8GetCPUInfo; -} - -//------------------------------------------------------------------------------ // Initialization // Speed-critical function pointers. We have to initialize them to the default @@ -852,42 +734,42 @@ VP8BlockCopy VP8Copy16x8; extern void VP8EncDspInitSSE2(void); extern void VP8EncDspInitSSE41(void); -extern void VP8EncDspInitAVX2(void); extern void VP8EncDspInitNEON(void); extern void VP8EncDspInitMIPS32(void); extern void VP8EncDspInitMIPSdspR2(void); extern void VP8EncDspInitMSA(void); -static volatile VP8CPUInfo enc_last_cpuinfo_used = - (VP8CPUInfo)&enc_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { - if (enc_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(VP8EncDspInit) { VP8DspInit(); // common inverse transforms InitTables(); // default C implementations - VP8CollectHistogram = CollectHistogram; - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8FTransform2 = FTransform2; - VP8FTransformWHT = FTransformWHT; - VP8EncPredLuma4 = Intra4Preds; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8Mean16x4 = Mean16x4; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlock; - VP8Copy4x4 = Copy4x4; - VP8Copy16x8 = Copy16x8; +#if !WEBP_NEON_OMIT_C_CODE + VP8ITransform = ITransform_C; + VP8FTransform = FTransform_C; + VP8FTransformWHT = FTransformWHT_C; + VP8TDisto4x4 = Disto4x4_C; + VP8TDisto16x16 = Disto16x16_C; + VP8CollectHistogram = CollectHistogram_C; + VP8SSE16x16 = SSE16x16_C; + VP8SSE16x8 = SSE16x8_C; + VP8SSE8x8 = SSE8x8_C; + VP8SSE4x4 = SSE4x4_C; +#endif + +#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC + VP8EncQuantizeBlock = QuantizeBlock_C; + VP8EncQuantize2Blocks = Quantize2Blocks_C; +#endif + + VP8FTransform2 = FTransform2_C; + VP8EncPredLuma4 = Intra4Preds_C; + VP8EncPredLuma16 = Intra16Preds_C; + VP8EncPredChroma8 = IntraChromaPreds_C; + VP8Mean16x4 = Mean16x4_C; + VP8EncQuantizeBlockWHT = QuantizeBlock_C; + VP8Copy4x4 = Copy4x4_C; + VP8Copy16x8 = Copy16x8_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { @@ -901,16 +783,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { #endif } #endif -#if defined(WEBP_USE_AVX2) - if (VP8GetCPUInfo(kAVX2)) { - VP8EncDspInitAVX2(); - } -#endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8EncDspInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { VP8EncDspInitMIPS32(); @@ -927,5 +799,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) { } #endif } - enc_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8EncDspInitNEON(); + } +#endif + + assert(VP8ITransform != NULL); + assert(VP8FTransform != NULL); + assert(VP8FTransformWHT != NULL); + assert(VP8TDisto4x4 != NULL); + assert(VP8TDisto16x16 != NULL); + assert(VP8CollectHistogram != NULL); + assert(VP8SSE16x16 != NULL); + assert(VP8SSE16x8 != NULL); + assert(VP8SSE8x8 != NULL); + assert(VP8SSE4x4 != NULL); + assert(VP8EncQuantizeBlock != NULL); + assert(VP8EncQuantize2Blocks != NULL); + assert(VP8FTransform2 != NULL); + assert(VP8EncPredLuma4 != NULL); + assert(VP8EncPredLuma16 != NULL); + assert(VP8EncPredChroma8 != NULL); + assert(VP8Mean16x4 != NULL); + assert(VP8EncQuantizeBlockWHT != NULL); + assert(VP8Copy4x4 != NULL); + assert(VP8Copy16x8 != NULL); } diff --git a/src/3rdparty/libwebp/src/dsp/enc_avx2.c b/src/3rdparty/libwebp/src/dsp/enc_avx2.c deleted file mode 100644 index 93efb30..0000000 --- a/src/3rdparty/libwebp/src/dsp/enc_avx2.c +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2014 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// AVX2 version of speed-critical encoding functions. - -#include "./dsp.h" - -#if defined(WEBP_USE_AVX2) - -#endif // WEBP_USE_AVX2 - -//------------------------------------------------------------------------------ -// Entry point - -WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2) diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c index 752b14d..618f0fc 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c @@ -13,13 +13,13 @@ // Jovan Zelincevic (jovan.zelincevic@imgtec.com) // Slobodan Prijic (slobodan.prijic@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS32) -#include "./mips_macro.h" -#include "../enc/vp8i_enc.h" -#include "../enc/cost_enc.h" +#include "src/dsp/mips_macro.h" +#include "src/enc/vp8i_enc.h" +#include "src/enc/cost_enc.h" static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; @@ -113,8 +113,9 @@ static const int kC2 = 35468; "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" // Does one or two inverse transforms. -static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, - uint8_t* dst) { +static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref, + const int16_t* in, + uint8_t* dst) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6; int temp7, temp8, temp9, temp10, temp11, temp12, temp13; int temp14, temp15, temp16, temp17, temp18, temp19, temp20; @@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ); } -static void ITransform(const uint8_t* ref, const int16_t* in, - uint8_t* dst, int do_two) { - ITransformOne(ref, in, dst); +static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in, + uint8_t* dst, int do_two) { + ITransformOne_MIPS32(ref, in, dst); if (do_two) { - ITransformOne(ref + 4, in + 16, dst + 4); + ITransformOne_MIPS32(ref + 4, in + 16, dst + 4); } } @@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in, "sh %[temp5], " #J "(%[ppin]) \n\t" \ "sh %[level], " #N "(%[pout]) \n\t" -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5; int sign, coeff, level, i; int max_level = MAX_LEVEL; @@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return 0; } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; - nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; - nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } @@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], "msub %[temp6], %[temp0] \n\t" \ "msub %[temp7], %[temp1] \n\t" -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int tmp[32]; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, #undef VERTICAL_PASS #undef HORIZONTAL_PASS -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_MIPS32(a + x + y, b + x + y, w); } } return D; @@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, + int16_t* out) { int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; int temp17, temp18, temp19, temp20; @@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { GET_SSE_INNER(C, C + 1, C + 2, C + 3) \ GET_SSE_INNER(D, D + 1, D + 2, D + 3) -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; @@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { extern void VP8EncDspInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) { - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; + VP8ITransform = ITransform_MIPS32; + VP8FTransform = FTransform_MIPS32; + + VP8EncQuantizeBlock = QuantizeBlock_MIPS32; + VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32; + + VP8TDisto4x4 = Disto4x4_MIPS32; + VP8TDisto16x16 = Disto16x16_MIPS32; + #if !defined(WORK_AROUND_GCC) - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; + VP8SSE16x16 = SSE16x16_MIPS32; + VP8SSE8x8 = SSE8x8_MIPS32; + VP8SSE16x8 = SSE16x8_MIPS32; + VP8SSE4x4 = SSE4x4_MIPS32; #endif } diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c index 6c8c1c6..9ddd895 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c @@ -12,13 +12,13 @@ // Author(s): Darko Laus (darko.laus@imgtec.com) // Mirko Raus (mirko.raus@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "./mips_macro.h" -#include "../enc/cost_enc.h" -#include "../enc/vp8i_enc.h" +#include "src/dsp/mips_macro.h" +#include "src/enc/cost_enc.h" +#include "src/enc/vp8i_enc.h" static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; @@ -141,7 +141,8 @@ static const int kC2 = 35468; "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t" -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const int c2217 = 2217; const int c5352 = 5352; int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; @@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ); } -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in, + uint8_t* dst, int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17; @@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, return abs(temp3 - temp17) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MIPSdspR2(const uint8_t* const a, + const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w); } } return D; @@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) { //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DCMode8(C8DC8 + dst, left, top); VerticalPred8(C8VE8 + dst, top); @@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MIPSdspR2(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DCMode16(I16DC16 + dst, left, top); VerticalPred16(I16VE16 + dst, top); HorizontalPred16(I16HE16 + dst, left); @@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst, // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) { GET_SSE_INNER(C) \ GET_SSE_INNER(D) -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { return count; } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) { int count; int temp0, temp1, temp2, temp3; __asm__ volatile ( @@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { "usw $0, " #J "(%[ppin]) \n\t" \ "3: \n\t" -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int temp0, temp1, temp2, temp3, temp4, temp5,temp6; int sign, coeff, level; int max_level = MAX_LEVEL; @@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return (ret != 0); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; - nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; - nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } @@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t" -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; @@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "addiu %[temp8], %[temp8], 1 \n\t" \ "sw %[temp8], 0(%[temp3]) \n\t" -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH; @@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, extern void VP8EncDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) { - VP8FTransform = FTransform; - VP8ITransform = ITransform; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - VP8EncPredLuma4 = Intra4Preds; + VP8FTransform = FTransform_MIPSdspR2; + VP8FTransformWHT = FTransformWHT_MIPSdspR2; + VP8ITransform = ITransform_MIPSdspR2; + + VP8TDisto4x4 = Disto4x4_MIPSdspR2; + VP8TDisto16x16 = Disto16x16_MIPSdspR2; + + VP8EncPredLuma16 = Intra16Preds_MIPSdspR2; + VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2; + VP8EncPredLuma4 = Intra4Preds_MIPSdspR2; + #if !defined(WORK_AROUND_GCC) - VP8SSE16x16 = SSE16x16; - VP8SSE8x8 = SSE8x8; - VP8SSE16x8 = SSE16x8; - VP8SSE4x4 = SSE4x4; + VP8SSE16x16 = SSE16x16_MIPSdspR2; + VP8SSE8x8 = SSE8x8_MIPSdspR2; + VP8SSE16x8 = SSE16x8_MIPSdspR2; + VP8SSE4x4 = SSE4x4_MIPSdspR2; #endif - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8FTransformWHT = FTransformWHT; - VP8CollectHistogram = CollectHistogram; + + VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2; + VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2; + + VP8CollectHistogram = CollectHistogram_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/enc_msa.c b/src/3rdparty/libwebp/src/dsp/enc_msa.c index 909b46d..6f85add 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_msa.c +++ b/src/3rdparty/libwebp/src/dsp/enc_msa.c @@ -11,13 +11,13 @@ // // Author: Prashant Patil (prashant.patil@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MSA) #include <stdlib.h> -#include "./msa_macro.h" -#include "../enc/vp8i_enc.h" +#include "src/dsp/msa_macro.h" +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // Transforms @@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); } -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { ITransformOne(ref, in, dst); if (do_two) { ITransformOne(ref + 4, in + 16, dst + 4); } } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_MSA(const uint8_t* src, const uint8_t* ref, + int16_t* out) { uint64_t out0, out1, out2, out3; uint32_t in0, in1, in2, in3; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v8i16 t0, t1, t2, t3; - v16u8 srcl0, srcl1, src0, src1; + v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 }; const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 }; const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 }; const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 }; @@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { SD4(out0, out1, out2, out3, out, 8); } -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_MSA(const int16_t* in, int16_t* out) { v8i16 in0 = { 0 }; v8i16 in1 = { 0 }; v8i16 tmp0, tmp1, tmp2, tmp3; @@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { ST_SH2(out0, out1, out, 8); } -static int TTransform(const uint8_t* in, const uint16_t* w) { +static int TTransform_MSA(const uint8_t* in, const uint16_t* w) { int sum; uint32_t in0_m, in1_m, in2_m, in3_m; - v16i8 src0; + v16i8 src0 = { 0 }; v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3; v4i32 dst0, dst1; const v16i8 zero = { 0 }; @@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) { return sum; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int sum1 = TTransform(a, w); - const int sum2 = TTransform(b, w); +static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int sum1 = TTransform_MSA(a, w); + const int sum2 = TTransform_MSA(b, w); return abs(sum2 - sum1) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_MSA(a + x + y, b + x + y, w); } } return D; @@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, //------------------------------------------------------------------------------ // Histogram -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { @@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, #define AVG2(a, b) (((a) + (b) + 1) >> 1) static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical + const v16u8 A1 = { 0 }; const uint64_t val_m = LD(top - 1); - const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m); + const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); const v16u8 B = SLDI_UB(A, A, 1); const v16u8 C = SLDI_UB(A, A, 2); const v16u8 AC = __msa_ave_u_b(A, C); @@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { } static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { + const v16u8 A2 = { 0 }; const uint64_t val_m = LD(top - 5); - const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); + const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m); const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]); const v16u8 B = SLDI_UB(A, A, 1); const v16u8 C = SLDI_UB(A, A, 2); @@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { } static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { + const v16u8 A1 = { 0 }; const uint64_t val_m = LD(top); - const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m); + const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); const v16u8 B = SLDI_UB(A, A, 1); const v16u8 C1 = SLDI_UB(A, A, 2); const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]); @@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { #undef AVG3 #undef AVG2 -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { +static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) { DC4(I4DC4 + dst, top); TM4(I4TM4 + dst, top); VE4(I4VE4 + dst, top); @@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left, STORE16x16(out, dst); } -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { +static void Intra16Preds_MSA(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { DCMode16x16(I16DC16 + dst, left, top); VerticalPred16x16(I16VE16 + dst, top); HorizontalPred16x16(I16HE16 + dst, left); @@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, const uint8_t* top) { uint64_t out; - v16u8 src; + v16u8 src = { 0 }; if (top != NULL && left != NULL) { const uint64_t left_m = LD(left); const uint64_t top_m = LD(top); @@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left, STORE8x8(out, dst); } -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block DCMode8x8(C8DC8 + dst, left, top); VerticalPred8x8(C8VE8 + dst, top); @@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \ } while (0) -static int SSE16x16(const uint8_t* a, const uint8_t* b) { +static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { +static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; @@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { return sum; } -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) { uint32_t sum = 0; uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 src, ref, tmp0, tmp1; + v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1; v8i16 diff0, diff1; v4i32 out0, out1; @@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ // Quantization -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { +static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { int sum; v8i16 in0, in1, sh0, sh1, out0, out1; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1; @@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], tmp1 = (tmp3 > maxlevel); tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0); tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1); - SUB2(0, tmp2, 0, tmp3, tmp0, tmp1); + SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1); tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0); tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1); LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh @@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return (sum > 0); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; @@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) { - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8FTransformWHT = FTransformWHT; - - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8CollectHistogram = CollectHistogram; - - VP8EncPredLuma4 = Intra4Preds; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - - VP8SSE16x16 = SSE16x16; - VP8SSE16x8 = SSE16x8; - VP8SSE8x8 = SSE8x8; - VP8SSE4x4 = SSE4x4; - - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlock; + VP8ITransform = ITransform_MSA; + VP8FTransform = FTransform_MSA; + VP8FTransformWHT = FTransformWHT_MSA; + + VP8TDisto4x4 = Disto4x4_MSA; + VP8TDisto16x16 = Disto16x16_MSA; + VP8CollectHistogram = CollectHistogram_MSA; + + VP8EncPredLuma4 = Intra4Preds_MSA; + VP8EncPredLuma16 = Intra16Preds_MSA; + VP8EncPredChroma8 = IntraChromaPreds_MSA; + + VP8SSE16x16 = SSE16x16_MSA; + VP8SSE16x8 = SSE16x8_MSA; + VP8SSE8x8 = SSE8x8_MSA; + VP8SSE4x4 = SSE4x4_MSA; + + VP8EncQuantizeBlock = QuantizeBlock_MSA; + VP8EncQuantize2Blocks = Quantize2Blocks_MSA; + VP8EncQuantizeBlockWHT = QuantizeBlock_MSA; } #else // !WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c index 6a078d6..43bf124 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_neon.c +++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c @@ -11,14 +11,14 @@ // // adapted from libvpx (http://www.webmproject.org/code/) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <assert.h> -#include "./neon.h" -#include "../enc/vp8i_enc.h" +#include "src/dsp/neon.h" +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) @@ -37,15 +37,15 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. #if defined(WEBP_USE_INTRINSICS) // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. -static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { +static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) { return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); } // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result // to the corresponding rows of 'dst'. -static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, - const int16x8_t dst01, - const int16x8_t dst23) { +static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst, + const int16x8_t dst01, + const int16x8_t dst23) { // Unsigned saturate to 8b. const uint8x8_t dst01_u8 = vqmovun_s16(dst01); const uint8x8_t dst23_u8 = vqmovun_s16(dst23); @@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); } -static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, - const uint8_t* const ref, uint8_t* const dst) { +static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01, + const int16x8_t row23, + const uint8_t* const ref, + uint8_t* const dst) { uint32x2_t dst01 = vdup_n_u32(0); uint32x2_t dst23 = vdup_n_u32(0); @@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, { // Convert to 16b. - const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); - const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); + const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01); + const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23); // Descale with rounding. const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); // Add the inverse transform. - SaturateAndStore4x4(dst, out01, out23); + SaturateAndStore4x4_NEON(dst, out01, out23); } } -static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, - int16x8x2_t* const out) { +static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0, + const int16x8_t in1, + int16x8x2_t* const out) { // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... @@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); } -static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { +static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) { // {rows} = in0 | in4 // in8 | in12 // B1 = in4 | in12 @@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); - Transpose8x2(E0, E1, rows); + Transpose8x2_NEON(E0, E1, rows); } -static void ITransformOne(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* ref, + const int16_t* in, uint8_t* dst) { int16x8x2_t rows; INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); - TransformPass(&rows); - TransformPass(&rows); - Add4x4(rows.val[0], rows.val[1], ref, dst); + TransformPass_NEON(&rows); + TransformPass_NEON(&rows); + Add4x4_NEON(rows.val[0], rows.val[1], ref, dst); } #else -static void ITransformOne(const uint8_t* ref, - const int16_t* in, uint8_t* dst) { +static void ITransformOne_NEON(const uint8_t* ref, + const int16_t* in, uint8_t* dst) { const int kBPS = BPS; const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; @@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref, #endif // WEBP_USE_INTRINSICS -static void ITransform(const uint8_t* ref, - const int16_t* in, uint8_t* dst, int do_two) { - ITransformOne(ref, in, dst); +static void ITransform_NEON(const uint8_t* ref, + const int16_t* in, uint8_t* dst, int do_two) { + ITransformOne_NEON(ref, in, dst); if (do_two) { - ITransformOne(ref + 4, in + 16, dst + 4); + ITransformOne_NEON(ref + 4, in + 16, dst + 4); } } // Load all 4x4 pixels into a single uint8x16_t variable. -static uint8x16_t Load4x4(const uint8_t* src) { +static uint8x16_t Load4x4_NEON(const uint8_t* src) { uint32x4_t out = vdupq_n_u32(0); out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); @@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) { #if defined(WEBP_USE_INTRINSICS) -static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, - const int16x4_t C, const int16x4_t D, - int16x8_t* const out01, - int16x8_t* const out32) { +static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A, + const int16x4_t B, + const int16x4_t C, + const int16x4_t D, + int16x8_t* const out01, + int16x8_t* const out32) { const int16x4x2_t AB = vtrn_s16(A, B); const int16x4x2_t CD = vtrn_s16(C, D); const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), @@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, vreinterpret_s64_s32(tmp02.val[1]))); } -static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a, - const uint8x8_t b) { +static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a, + const uint8x8_t b) { return vreinterpretq_s16_u16(vsubl_u8(a, b)); } -static void FTransform(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, + int16_t* out) { int16x8_t d0d1, d3d2; // working 4x4 int16 variables { - const uint8x16_t S0 = Load4x4(src); - const uint8x16_t R0 = Load4x4(ref); - const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0)); - const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0)); + const uint8x16_t S0 = Load4x4_NEON(src); + const uint8x16_t R0 = Load4x4_NEON(ref); + const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0)); + const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0)); const int16x4_t D0 = vget_low_s16(D0D1); const int16x4_t D1 = vget_high_s16(D0D1); const int16x4_t D2 = vget_low_s16(D2D3); const int16x4_t D3 = vget_high_s16(D2D3); - Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2); + Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2); } { // 1rst pass const int32x4_t kCst937 = vdupq_n_s32(937); @@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9); const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9); - Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2); + Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2); } { // 2nd pass // the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0) @@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = { 51000, 51000, 51000, 51000 }; -static void FTransform(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform_NEON(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const int kBPS = BPS; const uint8_t* src_ptr = src; const uint8_t* ref_ptr = ref; @@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, src += stride; \ } while (0) -static void FTransformWHT(const int16_t* src, int16_t* out) { +static void FTransformWHT_NEON(const int16_t* src, int16_t* out) { const int stride = 16; const int16x4_t zero = vdup_n_s16(0); int32x4x4_t tmp0; @@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) { tmp0.val[3] = vsubq_s32(a0, a1); } { - const int32x4x4_t tmp1 = Transpose4x4(tmp0); + const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0); // a0 = tmp[0 + i] + tmp[ 8 + i] // a1 = tmp[4 + i] + tmp[12 + i] // a2 = tmp[4 + i] - tmp[12 + i] @@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) { // a 26ae, b 26ae // a 37bf, b 37bf // -static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { +static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) { const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]), @@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { return q4_in; } -static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) { +static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON( + const int16x8x4_t q4_in) { // {a0, a1} = {in[0] + in[2], in[1] + in[3]} // {a3, a2} = {in[0] - in[2], in[1] - in[3]} const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); @@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) { return q4_out; } -static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) { +static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) { const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0], q4_in.val[2])); const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1], @@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) { return q4_out; } -static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { +static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) { const uint16x8_t q_w07 = vld1q_u16(&w[0]); const uint16x8_t q_w8f = vld1q_u16(&w[8]); int16x4x4_t d4_w; @@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { return d4_w; } -static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, - const int16x4x4_t d4_w) { +static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in, + const int16x4x4_t d4_w) { int32x2_t d_sum; // sum += w[ 0] * abs(b0); // sum += w[ 4] * abs(b1); @@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); uint32x2_t d_in_ab_4567 = vdup_n_u32(0); uint32x2_t d_in_ab_89ab = vdup_n_u32(0); @@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, // Vertical pass first to avoid a transpose (vertical and horizontal passes // are commutative because w/kWeightY is symmetric) and subsequent // transpose. - const int16x8x4_t q4_v = DistoVerticalPass(d4_in); - const int16x4x4_t d4_w = DistoLoadW(w); + const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in); + const int16x4x4_t d4_w = DistoLoadW_NEON(w); // horizontal pass - const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v); - const int16x8x4_t q4_h = DistoHorizontalPass(q4_t); - int32x2_t d_sum = DistoSum(q4_h, d4_w); + const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v); + const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t); + int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w); // abs(sum2 - sum1) >> 5 d_sum = vabs_s32(d_sum); @@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, } #undef LOAD_LANE_32b -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_NEON(a + x + y, b + x + y, w); } } return D; @@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, //------------------------------------------------------------------------------ -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; for (j = start_block; j < end_block; ++j) { int16_t out[16]; - FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); + FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out); { int k; const int16x8_t a0 = vld1q_s16(out + 0); @@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, //------------------------------------------------------------------------------ -static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, - const uint8_t* const b, - uint32x4_t* const sum) { +static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, + const uint8_t* const b, + uint32x4_t* const sum) { const uint8x16_t a0 = vld1q_u8(a); const uint8x16_t b0 = vld1q_u8(b); const uint8x16_t abs_diff = vabdq_u8(a0, b0); @@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, } // Horizontal sum of all four uint32_t values in 'sum'. -static int SumToInt(uint32x4_t sum) { +static int SumToInt_NEON(uint32x4_t sum) { const uint64x2_t sum2 = vpaddlq_u32(sum); const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); return (int)sum3; @@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 16; ++y) { - AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); + AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum); } - return SumToInt(sum); + return SumToInt_NEON(sum); } static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) { uint32x4_t sum = vdupq_n_u32(0); int y; for (y = 0; y < 8; ++y) { - AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); + AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum); } - return SumToInt(sum); + return SumToInt_NEON(sum); } static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { @@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) { const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); sum = vpadalq_u16(sum, prod); } - return SumToInt(sum); + return SumToInt_NEON(sum); } static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { - const uint8x16_t a0 = Load4x4(a); - const uint8x16_t b0 = Load4x4(b); + const uint8x16_t a0 = Load4x4_NEON(a); + const uint8x16_t b0 = Load4x4_NEON(b); const uint8x16_t abs_diff = vabdq_u8(a0, b0); const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); @@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { /* pair-wise adds and widen */ const uint32x4_t sum1 = vpaddlq_u16(prod1); const uint32x4_t sum2 = vpaddlq_u16(prod2); - return SumToInt(vaddq_u32(sum1, sum2)); + return SumToInt_NEON(vaddq_u32(sum1, sum2)); } //------------------------------------------------------------------------------ @@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) { // Compilation with gcc-4.6.x is problematic for now. #if !defined(WORK_AROUND_GCC) -static int16x8_t Quantize(int16_t* const in, - const VP8Matrix* const mtx, int offset) { +static int16x8_t Quantize_NEON(int16_t* const in, + const VP8Matrix* const mtx, int offset) { const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); @@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = { { 14, 15, 22, 23, 28, 29, 30, 31 } }; -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - const int16x8_t out0 = Quantize(in, mtx, 0); - const int16x8_t out1 = Quantize(in, mtx, 8); +static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + const int16x8_t out0 = Quantize_NEON(in, mtx, 0); + const int16x8_t out1 = Quantize_NEON(in, mtx, 8); uint8x8x4_t shuffles; // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. @@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], return 0; } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; - nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; - nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; + nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0; + nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1; return nz; } @@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { - VP8ITransform = ITransform; - VP8FTransform = FTransform; + VP8ITransform = ITransform_NEON; + VP8FTransform = FTransform_NEON; - VP8FTransformWHT = FTransformWHT; + VP8FTransformWHT = FTransformWHT_NEON; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8CollectHistogram = CollectHistogram; + VP8TDisto4x4 = Disto4x4_NEON; + VP8TDisto16x16 = Disto16x16_NEON; + VP8CollectHistogram = CollectHistogram_NEON; VP8SSE16x16 = SSE16x16_NEON; VP8SSE16x8 = SSE16x8_NEON; @@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) { VP8SSE4x4 = SSE4x4_NEON; #if !defined(WORK_AROUND_GCC) - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; + VP8EncQuantizeBlock = QuantizeBlock_NEON; + VP8EncQuantize2Blocks = Quantize2Blocks_NEON; #endif } diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c index 2026a74..b2e78ed 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c @@ -11,23 +11,23 @@ // // Author: Christian Duvivier (cduvivier@google.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) #include <assert.h> #include <stdlib.h> // for abs() #include <emmintrin.h> -#include "./common_sse2.h" -#include "../enc/cost_enc.h" -#include "../enc/vp8i_enc.h" +#include "src/dsp/common_sse2.h" +#include "src/enc/cost_enc.h" +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) // Does one or two inverse transforms. -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, } } -static void FTransformPass1(const __m128i* const in01, - const __m128i* const in23, - __m128i* const out01, - __m128i* const out32) { +static void FTransformPass1_SSE2(const __m128i* const in01, + const __m128i* const in23, + __m128i* const out01, + __m128i* const out32) { const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); @@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01, *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. } -static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, - int16_t* out) { +static void FTransformPass2_SSE2(const __m128i* const v01, + const __m128i* const v32, + int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, @@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, _mm_storeu_si128((__m128i*)&out[8], d2_f3); } -static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const __m128i zero = _mm_setzero_si128(); // Load src. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); @@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { __m128i v01, v32; // First pass - FTransformPass1(&row01, &row23, &v01, &v32); + FTransformPass1_SSE2(&row01, &row23, &v01, &v32); // Second pass - FTransformPass2(&v01, &v32, out); + FTransformPass2_SSE2(&v01, &v32, out); } -static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { +static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { const __m128i zero = _mm_setzero_si128(); // Load src and convert to 16b. @@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { __m128i v01h, v32h; // First pass - FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l); - FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h); + FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l); + FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h); // Second pass - FTransformPass2(&v01l, &v32l, out + 0); - FTransformPass2(&v01h, &v32h, out + 16); + FTransformPass2_SSE2(&v01l, &v32l, out + 0); + FTransformPass2_SSE2(&v01h, &v32h, out + 16); } -static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { +static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) { const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); @@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { *out = _mm_madd_epi16(D, kMult); } -static void FTransformWHT(const int16_t* in, int16_t* out) { +static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) { // Input is 12b signed. __m128i row0, row1, row2, row3; // Rows are 14b signed. - FTransformWHTRow(in + 0 * 64, &row0); - FTransformWHTRow(in + 1 * 64, &row1); - FTransformWHTRow(in + 2 * 64, &row2); - FTransformWHTRow(in + 3 * 64, &row3); + FTransformWHTRow_SSE2(in + 0 * 64, &row0); + FTransformWHTRow_SSE2(in + 1 * 64, &row1); + FTransformWHTRow_SSE2(in + 2 * 64, &row2); + FTransformWHTRow_SSE2(in + 3 * 64, &row3); { // The a* are 15b signed. @@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { // Compute susceptibility based on DCT-coeff histograms: // the higher, the "easier" the macroblock is to compress. -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const __m128i zero = _mm_setzero_si128(); const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; @@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, int16_t out[16]; int k; - FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); + FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out); // Convert coefficients to bin (within out[]). { @@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // Intra predictions // helper for chroma-DC predictions -static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { +static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; const __m128i values = _mm_set1_epi8(v); for (j = 0; j < 8; ++j) { @@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { } } -static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { +static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; const __m128i values = _mm_set1_epi8(v); for (j = 0; j < 16; ++j) { @@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { } } -static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) { +static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) { if (size == 4) { int j; for (j = 0; j < 4; ++j) { memset(dst + j * BPS, value, 4); } } else if (size == 8) { - Put8x8uv(value, dst); + Put8x8uv_SSE2(value, dst); } else { - Put16(value, dst); + Put16_SSE2(value, dst); } } -static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) { int j; const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); for (j = 0; j < 8; ++j) { @@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i top_values = _mm_load_si128((const __m128i*)top); int j; for (j = 0; j < 16; ++j) { @@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) { } } -static WEBP_INLINE void VerticalPred(uint8_t* dst, - const uint8_t* top, int size) { +static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, + const uint8_t* top, int size) { if (top != NULL) { if (size == 8) { - VE8uv(dst, top); + VE8uv_SSE2(dst, top); } else { - VE16(dst, top); + VE16_SSE2(dst, top); } } else { - Fill(dst, 127, size); + Fill_SSE2(dst, 127, size); } } -static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 8; ++j) { const __m128i values = _mm_set1_epi8(left[j]); @@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 16; ++j) { const __m128i values = _mm_set1_epi8(left[j]); @@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) { } } -static WEBP_INLINE void HorizontalPred(uint8_t* dst, - const uint8_t* left, int size) { +static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst, + const uint8_t* left, int size) { if (left != NULL) { if (size == 8) { - HE8uv(dst, left); + HE8uv_SSE2(dst, left); } else { - HE16(dst, left); + HE16_SSE2(dst, left); } } else { - Fill(dst, 129, size); + Fill_SSE2(dst, 129, size); } } -static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 8) { @@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left, } } -static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, - const uint8_t* top, int size) { +static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top, int size) { if (left != NULL) { if (top != NULL) { - TM(dst, left, top, size); + TM_SSE2(dst, left, top, size); } else { - HorizontalPred(dst, left, size); + HorizontalPred_SSE2(dst, left, size); } } else { // true motion without left samples (hence: with default 129 value) @@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, // Note that if top samples are not available, the default value is // then 129, and not 127 as in the VerticalPred case. if (top != NULL) { - VerticalPred(dst, top, size); + VerticalPred_SSE2(dst, top, size); } else { - Fill(dst, 129, size); + Fill_SSE2(dst, 129, size); } } } -static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); const int DC = VP8HorizontalAdd8b(&combined) + 8; - Put8x8uv(DC >> 4, dst); + Put8x8uv_SSE2(DC >> 4, dst); } -static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i sum = _mm_sad_epu8(top_values, zero); const int DC = _mm_cvtsi128_si32(sum) + 4; - Put8x8uv(DC >> 3, dst); + Put8x8uv_SSE2(DC >> 3, dst); } -static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) { // 'left' is contiguous so we can reuse the top summation. - DC8uvNoLeft(dst, left); + DC8uvNoLeft_SSE2(dst, left); } -static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) { - Put8x8uv(0x80, dst); +static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) { + Put8x8uv_SSE2(0x80, dst); } -static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { if (top != NULL) { if (left != NULL) { // top and left present - DC8uv(dst, left, top); + DC8uv_SSE2(dst, left, top); } else { // top, but no left - DC8uvNoLeft(dst, top); + DC8uvNoLeft_SSE2(dst, top); } } else if (left != NULL) { // left but no top - DC8uvNoTop(dst, left); + DC8uvNoTop_SSE2(dst, left); } else { // no top, no left, nothing. - DC8uvNoTopLeft(dst); + DC8uvNoTopLeft_SSE2(dst); } } -static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { const __m128i top_row = _mm_load_si128((const __m128i*)top); const __m128i left_row = _mm_load_si128((const __m128i*)left); const int DC = VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16; - Put16(DC >> 5, dst); + Put16_SSE2(DC >> 5, dst); } -static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i top_row = _mm_load_si128((const __m128i*)top); const int DC = VP8HorizontalAdd8b(&top_row) + 8; - Put16(DC >> 4, dst); + Put16_SSE2(DC >> 4, dst); } -static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) { +static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) { // 'left' is contiguous so we can reuse the top summation. - DC16NoLeft(dst, left); + DC16NoLeft_SSE2(dst, left); } -static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) { - Put16(0x80, dst); +static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) { + Put16_SSE2(0x80, dst); } -static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { if (top != NULL) { if (left != NULL) { // top and left present - DC16(dst, left, top); + DC16_SSE2(dst, left, top); } else { // top, but no left - DC16NoLeft(dst, top); + DC16NoLeft_SSE2(dst, top); } } else if (left != NULL) { // left but no top - DC16NoTop(dst, left); + DC16NoTop_SSE2(dst, left); } else { // no top, no left, nothing. - DC16NoTopLeft(dst); + DC16NoTopLeft_SSE2(dst); } } @@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left, // where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 // and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 -static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical +static WEBP_INLINE void VE4_SSE2(uint8_t* dst, + const uint8_t* top) { // vertical const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1)); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical } } -static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal +static WEBP_INLINE void HE4_SSE2(uint8_t* dst, + const uint8_t* top) { // horizontal const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); } -static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) { uint32_t dc = 4; int i; for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; - Fill(dst, dc >> 3, 4); + Fill_SSE2(dst, dc >> 3, 4); } -static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left +static WEBP_INLINE void LD4_SSE2(uint8_t* dst, + const uint8_t* top) { // Down-Left const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); @@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static WEBP_INLINE void VR4(uint8_t* dst, - const uint8_t* top) { // Vertical-Right +static WEBP_INLINE void VR4_SSE2(uint8_t* dst, + const uint8_t* top) { // Vertical-Right const __m128i one = _mm_set1_epi8(1); const int I = top[-2]; const int J = top[-3]; @@ -771,7 +777,7 @@ static WEBP_INLINE void VR4(uint8_t* dst, const __m128i ABCD0 = _mm_srli_si128(XABCD, 1); const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0); const __m128i _XABCD = _mm_slli_si128(XABCD, 1); - const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0); + const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0); const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0); const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); @@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, DST(0, 3) = AVG3(K, J, I); } -static WEBP_INLINE void VL4(uint8_t* dst, - const uint8_t* top) { // Vertical-Left +static WEBP_INLINE void VL4_SSE2(uint8_t* dst, + const uint8_t* top) { // Vertical-Left const __m128i one = _mm_set1_epi8(1); const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); @@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, DST(3, 3) = (extra_out >> 8) & 0xff; } -static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right +static WEBP_INLINE void RD4_SSE2(uint8_t* dst, + const uint8_t* top) { // Down-right const __m128i one = _mm_set1_epi8(1); const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5)); const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4); @@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } -static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { const int I = top[-2]; const int J = top[-3]; const int K = top[-4]; @@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } -static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { const int X = top[-1]; const int I = top[-2]; const int J = top[-3]; @@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { DST(1, 3) = AVG3(L, K, J); } -static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { +static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); @@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { // Left samples are top[-5 .. -2], top_left is top[-1], top are // located at top[0..3], and top right is top[4..7] -static void Intra4Preds(uint8_t* dst, const uint8_t* top) { - DC4(I4DC4 + dst, top); - TM4(I4TM4 + dst, top); - VE4(I4VE4 + dst, top); - HE4(I4HE4 + dst, top); - RD4(I4RD4 + dst, top); - VR4(I4VR4 + dst, top); - LD4(I4LD4 + dst, top); - VL4(I4VL4 + dst, top); - HD4(I4HD4 + dst, top); - HU4(I4HU4 + dst, top); +static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) { + DC4_SSE2(I4DC4 + dst, top); + TM4_SSE2(I4TM4 + dst, top); + VE4_SSE2(I4VE4 + dst, top); + HE4_SSE2(I4HE4 + dst, top); + RD4_SSE2(I4RD4 + dst, top); + VR4_SSE2(I4VR4 + dst, top); + LD4_SSE2(I4LD4 + dst, top); + VL4_SSE2(I4VL4 + dst, top); + HD4_SSE2(I4HD4 + dst, top); + HU4_SSE2(I4HU4 + dst, top); } //------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) -static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, - const uint8_t* top) { +static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { // U block - DC8uvMode(C8DC8 + dst, left, top); - VerticalPred(C8VE8 + dst, top, 8); - HorizontalPred(C8HE8 + dst, left, 8); - TrueMotion(C8TM8 + dst, left, top, 8); + DC8uvMode_SSE2(C8DC8 + dst, left, top); + VerticalPred_SSE2(C8VE8 + dst, top, 8); + HorizontalPred_SSE2(C8HE8 + dst, left, 8); + TrueMotion_SSE2(C8TM8 + dst, left, top, 8); // V block dst += 8; if (top != NULL) top += 8; if (left != NULL) left += 16; - DC8uvMode(C8DC8 + dst, left, top); - VerticalPred(C8VE8 + dst, top, 8); - HorizontalPred(C8HE8 + dst, left, 8); - TrueMotion(C8TM8 + dst, left, top, 8); + DC8uvMode_SSE2(C8DC8 + dst, left, top); + VerticalPred_SSE2(C8VE8 + dst, top, 8); + HorizontalPred_SSE2(C8HE8 + dst, left, 8); + TrueMotion_SSE2(C8TM8 + dst, left, top, 8); } //------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) -static void Intra16Preds(uint8_t* dst, - const uint8_t* left, const uint8_t* top) { - DC16Mode(I16DC16 + dst, left, top); - VerticalPred(I16VE16 + dst, top, 16); - HorizontalPred(I16HE16 + dst, left, 16); - TrueMotion(I16TM16 + dst, left, top, 16); +static void Intra16Preds_SSE2(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { + DC16Mode_SSE2(I16DC16 + dst, left, top); + VerticalPred_SSE2(I16VE16 + dst, top, 16); + HorizontalPred_SSE2(I16HE16 + dst, left, 16); + TrueMotion_SSE2(I16TM16 + dst, left, top, 16); } //------------------------------------------------------------------------------ // Metric -static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b, - __m128i* const sum) { +static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a, + const __m128i b, + __m128i* const sum) { // take abs(a-b) in 8b const __m128i a_b = _mm_subs_epu8(a, b); const __m128i b_a = _mm_subs_epu8(b, a); @@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b, *sum = _mm_add_epi32(sum1, sum2); } -static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b, - int num_pairs) { +static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b, + int num_pairs) { __m128i sum = _mm_setzero_si128(); int32_t tmp[4]; int i; @@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b, const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]); const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]); __m128i sum1, sum2; - SubtractAndAccumulate(a0, b0, &sum1); - SubtractAndAccumulate(a1, b1, &sum2); + SubtractAndAccumulate_SSE2(a0, b0, &sum1); + SubtractAndAccumulate_SSE2(a1, b1, &sum2); sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2)); a += 2 * BPS; b += 2 * BPS; @@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b, return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); } -static int SSE16x16(const uint8_t* a, const uint8_t* b) { - return SSE_16xN(a, b, 8); +static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) { + return SSE_16xN_SSE2(a, b, 8); } -static int SSE16x8(const uint8_t* a, const uint8_t* b) { - return SSE_16xN(a, b, 4); +static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) { + return SSE_16xN_SSE2(a, b, 4); } #define LOAD_8x16b(ptr) \ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero) -static int SSE8x8(const uint8_t* a, const uint8_t* b) { +static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) { const __m128i zero = _mm_setzero_si128(); int num_pairs = 4; __m128i sum = zero; @@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) { } #undef LOAD_8x16b -static int SSE4x4(const uint8_t* a, const uint8_t* b) { +static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) { const __m128i zero = _mm_setzero_si128(); // Load values. Note that we read 8 pixels instead of 4, @@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { //------------------------------------------------------------------------------ -static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { +static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) { const __m128i mask = _mm_set1_epi16(0x00ff); const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); @@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); @@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int diff_sum = TTransform(a, b, w); +static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransform_SSE2(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_SSE2(a + x + y, b + x + y, w); } } return D; @@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, // Quantization // -static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], - const uint16_t* const sharpen, - const VP8Matrix* const mtx) { +static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16], + const uint16_t* const sharpen, + const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i coeff0, coeff8; @@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); } -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx); +static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx); } -static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, NULL, mtx); +static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock_SSE2(in, out, NULL, mtx); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; - nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; - nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; + nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; + nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; return nz; } @@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { - VP8CollectHistogram = CollectHistogram; - VP8EncPredLuma16 = Intra16Preds; - VP8EncPredChroma8 = IntraChromaPreds; - VP8EncPredLuma4 = Intra4Preds; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlockWHT; - VP8ITransform = ITransform; - VP8FTransform = FTransform; - VP8FTransform2 = FTransform2; - VP8FTransformWHT = FTransformWHT; - VP8SSE16x16 = SSE16x16; - VP8SSE16x8 = SSE16x8; - VP8SSE8x8 = SSE8x8; - VP8SSE4x4 = SSE4x4; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; - VP8Mean16x4 = Mean16x4; -} - -//------------------------------------------------------------------------------ -// SSIM / PSNR entry point (TODO(skal): move to its own file later) - -static uint32_t AccumulateSSE_SSE2(const uint8_t* src1, - const uint8_t* src2, int len) { - int i = 0; - uint32_t sse2 = 0; - if (len >= 16) { - const int limit = len - 32; - int32_t tmp[4]; - __m128i sum1; - __m128i sum = _mm_setzero_si128(); - __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]); - __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]); - i += 16; - while (i <= limit) { - const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]); - const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]); - __m128i sum2; - i += 16; - SubtractAndAccumulate(a0, b0, &sum1); - sum = _mm_add_epi32(sum, sum1); - a0 = _mm_loadu_si128((const __m128i*)&src1[i]); - b0 = _mm_loadu_si128((const __m128i*)&src2[i]); - i += 16; - SubtractAndAccumulate(a1, b1, &sum2); - sum = _mm_add_epi32(sum, sum2); - } - SubtractAndAccumulate(a0, b0, &sum1); - sum = _mm_add_epi32(sum, sum1); - _mm_storeu_si128((__m128i*)tmp, sum); - sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]); - } - - for (; i < len; ++i) { - const int32_t diff = src1[i] - src2[i]; - sse2 += diff * diff; - } - return sse2; -} - -static uint32_t HorizontalAdd16b(const __m128i* const m) { - uint16_t tmp[8]; - const __m128i a = _mm_srli_si128(*m, 8); - const __m128i b = _mm_add_epi16(*m, a); - _mm_storeu_si128((__m128i*)tmp, b); - return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0]; -} - -static uint32_t HorizontalAdd32b(const __m128i* const m) { - const __m128i a = _mm_srli_si128(*m, 8); - const __m128i b = _mm_add_epi32(*m, a); - const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4)); - return (uint32_t)_mm_cvtsi128_si32(c); -} - -static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 }; - -#define ACCUMULATE_ROW(WEIGHT) do { \ - /* compute row weight (Wx * Wy) */ \ - const __m128i Wy = _mm_set1_epi16((WEIGHT)); \ - const __m128i W = _mm_mullo_epi16(Wx, Wy); \ - /* process 8 bytes at a time (7 bytes, actually) */ \ - const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \ - const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \ - /* convert to 16b and multiply by weight */ \ - const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \ - const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \ - const __m128i wa1 = _mm_mullo_epi16(a1, W); \ - const __m128i wb1 = _mm_mullo_epi16(b1, W); \ - /* accumulate */ \ - xm = _mm_add_epi16(xm, wa1); \ - ym = _mm_add_epi16(ym, wb1); \ - xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \ - xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \ - yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \ - src1 += stride1; \ - src2 += stride2; \ -} while (0) - -static double SSIMGet_SSE2(const uint8_t* src1, int stride1, - const uint8_t* src2, int stride2) { - VP8DistoStats stats; - const __m128i zero = _mm_setzero_si128(); - __m128i xm = zero, ym = zero; // 16b accums - __m128i xxm = zero, yym = zero, xym = zero; // 32b accum - const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); - assert(2 * VP8_SSIM_KERNEL + 1 == 7); - ACCUMULATE_ROW(1); - ACCUMULATE_ROW(2); - ACCUMULATE_ROW(3); - ACCUMULATE_ROW(4); - ACCUMULATE_ROW(3); - ACCUMULATE_ROW(2); - ACCUMULATE_ROW(1); - stats.xm = HorizontalAdd16b(&xm); - stats.ym = HorizontalAdd16b(&ym); - stats.xxm = HorizontalAdd32b(&xxm); - stats.xym = HorizontalAdd32b(&xym); - stats.yym = HorizontalAdd32b(&yym); - return VP8SSIMFromStats(&stats); -} - -extern void VP8SSIMDspInitSSE2(void); - -WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { - VP8AccumulateSSE = AccumulateSSE_SSE2; - VP8SSIMGet = SSIMGet_SSE2; + VP8CollectHistogram = CollectHistogram_SSE2; + VP8EncPredLuma16 = Intra16Preds_SSE2; + VP8EncPredChroma8 = IntraChromaPreds_SSE2; + VP8EncPredLuma4 = Intra4Preds_SSE2; + VP8EncQuantizeBlock = QuantizeBlock_SSE2; + VP8EncQuantize2Blocks = Quantize2Blocks_SSE2; + VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2; + VP8ITransform = ITransform_SSE2; + VP8FTransform = FTransform_SSE2; + VP8FTransform2 = FTransform2_SSE2; + VP8FTransformWHT = FTransformWHT_SSE2; + VP8SSE16x16 = SSE16x16_SSE2; + VP8SSE16x8 = SSE16x8_SSE2; + VP8SSE8x8 = SSE8x8_SSE2; + VP8SSE4x4 = SSE4x4_SSE2; + VP8TDisto4x4 = Disto4x4_SSE2; + VP8TDisto16x16 = Disto16x16_SSE2; + VP8Mean16x4 = Mean16x4_SSE2; } #else // !WEBP_USE_SSE2 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) -WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) #endif // WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c index e32086d..924035a 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c @@ -11,21 +11,21 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE41) #include <smmintrin.h> #include <stdlib.h> // for abs() -#include "./common_sse2.h" -#include "../enc/vp8i_enc.h" +#include "src/dsp/common_sse2.h" +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // Compute susceptibility based on DCT-coeff histograms. -static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; int distribution[MAX_COEFF_THRESH + 1] = { 0 }; @@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. -static int TTransform(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; @@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int diff_sum = TTransform(a, b, w); +static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransform_SSE41(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4(a + x + y, b + x + y, w); + D += Disto4x4_SSE41(a + x + y, b + x + y, w); } } return D; @@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \ 2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0) -static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], - const uint16_t* const sharpen, - const VP8Matrix* const mtx) { +static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16], + const uint16_t* const sharpen, + const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; @@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], #undef PSHUFB_CST -static int QuantizeBlock(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx); +static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx); } -static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return DoQuantizeBlock(in, out, NULL, mtx); +static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock_SSE41(in, out, NULL, mtx); } -static int Quantize2Blocks(int16_t in[32], int16_t out[32], - const VP8Matrix* const mtx) { +static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32], + const VP8Matrix* const mtx) { int nz; const uint16_t* const sharpen = &mtx->sharpen_[0]; - nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; - nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; + nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; + nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; return nz; } @@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32], extern void VP8EncDspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) { - VP8CollectHistogram = CollectHistogram; - VP8EncQuantizeBlock = QuantizeBlock; - VP8EncQuantize2Blocks = Quantize2Blocks; - VP8EncQuantizeBlockWHT = QuantizeBlockWHT; - VP8TDisto4x4 = Disto4x4; - VP8TDisto16x16 = Disto16x16; + VP8CollectHistogram = CollectHistogram_SSE41; + VP8EncQuantizeBlock = QuantizeBlock_SSE41; + VP8EncQuantize2Blocks = Quantize2Blocks_SSE41; + VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41; + VP8TDisto4x4 = Disto4x4_SSE41; + VP8TDisto16x16 = Disto16x16_SSE41; } #else // !WEBP_USE_SSE41 diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c index 65f34aa..9e910d9 100644 --- a/src/3rdparty/libwebp/src/dsp/filters.c +++ b/src/3rdparty/libwebp/src/dsp/filters.c @@ -11,7 +11,7 @@ // // Author: Urvang (urvang@google.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #include <assert.h> #include <stdlib.h> #include <string.h> @@ -20,31 +20,32 @@ // Helpful macro. # define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. -static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { +#if !WEBP_NEON_OMIT_C_CODE +static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred, + uint8_t* dst, int length, int inverse) { int i; if (inverse) { - for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i]; + for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]); } else { - for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i]; + for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]); } } //------------------------------------------------------------------------------ // Horizontal filter. -static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + int inverse, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -56,7 +57,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); row = 1; preds += stride; in += stride; @@ -66,8 +67,8 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { // Leftmost pixel is predicted from above. - PredictLine(in, preds - stride, out, 1, inverse); - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in, preds - stride, out, 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); ++row; preds += stride; in += stride; @@ -78,10 +79,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, //------------------------------------------------------------------------------ // Vertical filter. -static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { +static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + int inverse, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -94,7 +95,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); row = 1; in += stride; out += stride; @@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - PredictLine(in, preds, out, width, inverse); + PredictLine_C(in, preds, out, width, inverse); ++row; preds += stride; in += stride; out += stride; } } +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ // Gradient filter. -static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { +static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) { const int g = a + b - c; return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit } -static WEBP_INLINE void DoGradientFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { +#if !WEBP_NEON_OMIT_C_CODE +static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + int inverse, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLine(in + 1, preds, out + 1, width - 1, inverse); + PredictLine_C(in + 1, preds, out + 1, width - 1, inverse); row = 1; preds += stride; in += stride; @@ -147,12 +150,12 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, while (row < last_row) { int w; // leftmost pixel: predict from above. - PredictLine(in, preds - stride, out, 1, inverse); + PredictLine_C(in, preds - stride, out, 1, inverse); for (w = 1; w < width; ++w) { - const int pred = GradientPredictor(preds[w - 1], - preds[w - stride], - preds[w - stride - 1]); - out[w] = in[w] + (inverse ? pred : -pred); + const int pred = GradientPredictor_C(preds[w - 1], + preds[w - stride], + preds[w - stride - 1]); + out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred)); } ++row; preds += stride; @@ -160,60 +163,64 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, out += stride; } } +#endif // !WEBP_NEON_OMIT_C_CODE #undef SANITY_CHECK //------------------------------------------------------------------------------ -static void HorizontalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); +#if !WEBP_NEON_OMIT_C_CODE +static void HorizontalFilter_C(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoHorizontalFilter_C(data, width, height, stride, 0, height, 0, + filtered_data); } -static void VerticalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); +static void VerticalFilter_C(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data); } - -static void GradientFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); +static void GradientFilter_C(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data); } - +#endif // !WEBP_NEON_OMIT_C_CODE //------------------------------------------------------------------------------ -static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { uint8_t pred = (prev == NULL) ? 0 : prev[0]; int i; for (i = 0; i < width; ++i) { - out[i] = pred + in[i]; + out[i] = (uint8_t)(pred + in[i]); pred = out[i]; } } -static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +#if !WEBP_NEON_OMIT_C_CODE +static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_C(NULL, in, out, width); } else { int i; - for (i = 0; i < width; ++i) out[i] = prev[i] + in[i]; + for (i = 0; i < width; ++i) out[i] = (uint8_t)(prev[i] + in[i]); } } +#endif // !WEBP_NEON_OMIT_C_CODE -static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_C(NULL, in, out, width); } else { uint8_t top = prev[0], top_left = top, left = top; int i; for (i = 0; i < width; ++i) { top = prev[i]; // need to read this first, in case prev==out - left = in[i] + GradientPredictor(left, top, top_left); + left = (uint8_t)(in[i] + GradientPredictor_C(left, top, top_left)); top_left = top; out[i] = left; } @@ -231,21 +238,20 @@ extern void VP8FiltersInitMSA(void); extern void VP8FiltersInitNEON(void); extern void VP8FiltersInitSSE2(void); -static volatile VP8CPUInfo filters_last_cpuinfo_used = - (VP8CPUInfo)&filters_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) { - if (filters_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(VP8FiltersInit) { WebPUnfilters[WEBP_FILTER_NONE] = NULL; - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; - WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; - WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; +#if !WEBP_NEON_OMIT_C_CODE + WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C; + WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C; +#endif + WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C; WebPFilters[WEBP_FILTER_NONE] = NULL; - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; +#if !WEBP_NEON_OMIT_C_CODE + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C; +#endif if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -253,11 +259,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) { VP8FiltersInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8FiltersInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { VP8FiltersInitMIPSdspR2(); @@ -269,5 +270,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) { } #endif } - filters_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8FiltersInitNEON(); + } +#endif + + assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL); + assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL); + assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL); + assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL); + assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL); + assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL); } diff --git a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c index 1d82e3c..9382b12 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c @@ -12,11 +12,11 @@ // Author(s): Branimir Vasic (branimir.vasic@imgtec.com) // Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "../dsp/dsp.h" +#include "src/dsp/dsp.h" #include <assert.h> #include <stdlib.h> #include <string.h> @@ -101,8 +101,8 @@ ); \ } while (0) -static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, - int length) { +static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst, + int length) { DO_PREDICT_LINE(src, dst, length, 0); } @@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, } \ } while (0) -static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in, + int width, int height, + int stride, + int row, int num_rows, + uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1); + PredictLine_MIPSdspR2(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, } #undef FILTER_LINE_BY_LINE -static void HorizontalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); +static void HorizontalFilter_MIPSdspR2(const uint8_t* data, + int width, int height, + int stride, uint8_t* filtered_data) { + DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height, + filtered_data); } //------------------------------------------------------------------------------ @@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height, } \ } while (0) -static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, uint8_t* out) { +static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in, + int width, int height, + int stride, + int row, int num_rows, + uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLine(in + 1, out + 1, width - 1); + PredictLine_MIPSdspR2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, } #undef FILTER_LINE_BY_LINE -static void VerticalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); +static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height, + filtered_data); } //------------------------------------------------------------------------------ // Gradient filter. -static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { +static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) { int temp0; __asm__ volatile ( "addu %[temp0], %[a], %[b] \n\t" @@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { int w; \ PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \ for (w = 1; w < width; ++w) { \ - const int pred = GradientPredictor(PREDS[w - 1], \ - PREDS[w - stride], \ - PREDS[w - stride - 1]); \ + const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \ + PREDS[w - stride], \ + PREDS[w - stride - 1]); \ out[w] = in[w] OPERATION pred; \ } \ ++row; \ @@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { } \ } while (0) -static WEBP_INLINE void DoGradientFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, uint8_t* out) { +static void DoGradientFilter_MIPSdspR2(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; @@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1); + PredictLine_MIPSdspR2(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, } #undef FILTER_LINE_BY_LINE -static void GradientFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, filtered_data); +static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height, + filtered_data); } //------------------------------------------------------------------------------ -static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { out[0] = in[0] + (prev == NULL ? 0 : prev[0]); DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1); } -static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_MIPSdspR2(NULL, in, out, width); } else { DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1); } } -static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_MIPSdspR2(NULL, in, out, width); } else { uint8_t top = prev[0], top_left = top, left = top; int i; for (i = 0; i < width; ++i) { top = prev[i]; // need to read this first, in case prev==dst - left = in[i] + GradientPredictor(left, top, top_left); + left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left); top_left = top; out[i] = left; } @@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, extern void VP8FiltersInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) { - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; - WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; - WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; + WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2; + WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2; + WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2; - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/filters_msa.c b/src/3rdparty/libwebp/src/dsp/filters_msa.c index 4b8922d..14c437d 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_msa.c +++ b/src/3rdparty/libwebp/src/dsp/filters_msa.c @@ -11,11 +11,11 @@ // // Author: Prashant Patil (prashant.patil@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MSA) -#include "./msa_macro.h" +#include "src/dsp/msa_macro.h" #include <assert.h> @@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, //------------------------------------------------------------------------------ // Horrizontal filter -static void HorizontalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void HorizontalFilter_MSA(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { const uint8_t* preds = data; const uint8_t* in = data; uint8_t* out = filtered_data; @@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput, } -static void GradientFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void GradientFilter_MSA(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { const uint8_t* in = data; const uint8_t* preds = data; uint8_t* out = filtered_data; @@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height, //------------------------------------------------------------------------------ // Vertical filter -static void VerticalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { +static void VerticalFilter_MSA(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { const uint8_t* in = data; const uint8_t* preds = data; uint8_t* out = filtered_data; @@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height, extern void VP8FiltersInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) { - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA; } #else // !WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/filters_neon.c b/src/3rdparty/libwebp/src/dsp/filters_neon.c index 4d6e50c..3e6a578 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_neon.c +++ b/src/3rdparty/libwebp/src/dsp/filters_neon.c @@ -11,12 +11,12 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <assert.h> -#include "./neon.h" +#include "src/dsp/neon.h" //------------------------------------------------------------------------------ // Helpful macros. @@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in, } static void VerticalFilter_NEON(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { + int stride, uint8_t* filtered_data) { DoVerticalFilter_NEON(data, width, height, stride, 0, height, filtered_data); } @@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in, } static void GradientFilter_NEON(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { + int stride, uint8_t* filtered_data) { DoGradientFilter_NEON(data, width, height, stride, 0, height, filtered_data); } @@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in, // GradientUnfilter_NEON is correct but slower than the C-version, // at least on ARM64. For armv7, it's a wash. // So best is to disable it for now, but keep the idea around... -// #define USE_GRADIENT_UNFILTER +#if !defined(USE_GRADIENT_UNFILTER) +#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE +#endif -#if defined(USE_GRADIENT_UNFILTER) +#if (USE_GRADIENT_UNFILTER == 1) #define GRAD_PROCESS_LANE(L) do { \ const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \ const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \ @@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in, #undef GRAD_PROCESS_LANE static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { + uint8_t* out, int width) { if (prev == NULL) { HorizontalUnfilter_NEON(NULL, in, out, width); } else { @@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) { WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON; WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON; -#if defined(USE_GRADIENT_UNFILTER) +#if (USE_GRADIENT_UNFILTER == 1) WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON; #endif diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c index 67f7799..4b3f2d0 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) @@ -24,16 +24,16 @@ // Helpful macro. # define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ + assert((in) != NULL); \ + assert((out) != NULL); \ assert(width > 0); \ assert(height > 0); \ assert(stride >= width); \ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. -static void PredictLineTop(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length) { +static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred, + uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); @@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred, } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). -static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { +static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); @@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { //------------------------------------------------------------------------------ // Horizontal filter. -static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - uint8_t* out) { +static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in, + int width, int height, + int stride, + int row, int num_rows, + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, while (row < last_row) { // Leftmost pixel is predicted from above. out[0] = in[0] - in[-stride]; - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); ++row; in += stride; out += stride; @@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, //------------------------------------------------------------------------------ // Vertical filter. -static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, uint8_t* out) { +static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - PredictLineTop(in, in - stride, out, width); + PredictLineTop_SSE2(in, in - stride, out, width); ++row; in += stride; out += stride; @@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, //------------------------------------------------------------------------------ // Gradient filter. -static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) { +static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) { const int g = a + b - c; return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit } -static void GradientPredictDirect(const uint8_t* const row, - const uint8_t* const top, - uint8_t* const out, int length) { +static void GradientPredictDirect_SSE2(const uint8_t* const row, + const uint8_t* const top, + uint8_t* const out, int length) { const int max_pos = length & ~7; int i; const __m128i zero = _mm_setzero_si128(); @@ -161,14 +163,15 @@ static void GradientPredictDirect(const uint8_t* const row, _mm_storel_epi64((__m128i*)(out + i), H); } for (; i < length; ++i) { - out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]); + const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]); + out[i] = (uint8_t)(row[i] - delta); } } -static WEBP_INLINE void DoGradientFilter(const uint8_t* in, - int width, int height, int stride, - int row, int num_rows, - uint8_t* out) { +static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in, + int width, int height, int stride, + int row, int num_rows, + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -178,7 +181,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1); + PredictLineLeft_SSE2(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -186,8 +189,8 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - out[0] = in[0] - in[-stride]; - GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); + out[0] = (uint8_t)(in[0] - in[-stride]); + GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1); ++row; in += stride; out += stride; @@ -198,29 +201,30 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, //------------------------------------------------------------------------------ -static void HorizontalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); +static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoHorizontalFilter_SSE2(data, width, height, stride, 0, height, + filtered_data); } -static void VerticalFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); +static void VerticalFilter_SSE2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data); } -static void GradientFilter(const uint8_t* data, int width, int height, - int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, filtered_data); +static void GradientFilter_SSE2(const uint8_t* data, int width, int height, + int stride, uint8_t* filtered_data) { + DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ // Inverse transforms -static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { int i; __m128i last; - out[0] = in[0] + (prev == NULL ? 0 : prev[0]); + out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0])); if (width <= 1) return; last = _mm_set_epi32(0, 0, 0, out[0]); for (i = 1; i + 8 <= width; i += 8) { @@ -235,13 +239,13 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, _mm_storel_epi64((__m128i*)(out + i), A7); last = _mm_srli_epi64(A7, 56); } - for (; i < width; ++i) out[i] = in[i] + out[i - 1]; + for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]); } -static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_SSE2(NULL, in, out, width); } else { int i; const int max_pos = width & ~31; @@ -256,13 +260,13 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, _mm_storeu_si128((__m128i*)&out[i + 0], C0); _mm_storeu_si128((__m128i*)&out[i + 16], C1); } - for (; i < width; ++i) out[i] = in[i] + prev[i]; + for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]); } } -static void GradientPredictInverse(const uint8_t* const in, - const uint8_t* const top, - uint8_t* const row, int length) { +static void GradientPredictInverse_SSE2(const uint8_t* const in, + const uint8_t* const top, + uint8_t* const row, int length) { if (length > 0) { int i; const int max_pos = length & ~7; @@ -293,18 +297,19 @@ static void GradientPredictInverse(const uint8_t* const in, _mm_storel_epi64((__m128i*)&row[i], out); } for (; i < length; ++i) { - row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); + const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]); + row[i] = (uint8_t)(in[i] + delta); } } } -static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, - uint8_t* out, int width) { +static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { if (prev == NULL) { - HorizontalUnfilter(NULL, in, out, width); + HorizontalUnfilter_SSE2(NULL, in, out, width); } else { - out[0] = in[0] + prev[0]; // predict from above - GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1); + out[0] = (uint8_t)(in[0] + prev[0]); // predict from above + GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1); } } @@ -314,13 +319,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, extern void VP8FiltersInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) { - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; - WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; - WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; + WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2; + WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2; + WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2; - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c index 20d18f6..d05af84 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.c +++ b/src/3rdparty/libwebp/src/dsp/lossless.c @@ -13,16 +13,15 @@ // Jyrki Alakuijala (jyrki@google.com) // Urvang Joshi (urvang@google.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" +#include <assert.h> #include <math.h> #include <stdlib.h> -#include "../dec/vp8li_dec.h" -#include "../utils/endian_inl_utils.h" -#include "./lossless.h" -#include "./lossless_common.h" - -#define MAX_DIFF_COST (1e30f) +#include "src/dec/vp8li_dec.h" +#include "src/utils/endian_inl_utils.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" //------------------------------------------------------------------------------ // Image transforms. @@ -80,8 +79,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; } -// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined. -#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409 +// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is +// inlined. +#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409 # define LOCAL_INLINE __attribute__ ((noinline)) #else # define LOCAL_INLINE WEBP_INLINE @@ -107,69 +107,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { //------------------------------------------------------------------------------ // Predictors -static uint32_t Predictor0(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) { (void)top; (void)left; return ARGB_BLACK; } -static uint32_t Predictor1(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) { (void)top; return left; } -static uint32_t Predictor2(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) { (void)left; return top[0]; } -static uint32_t Predictor3(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) { (void)left; return top[1]; } -static uint32_t Predictor4(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) { (void)left; return top[-1]; } -static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average3(left, top[0], top[1]); return pred; } -static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[-1]); return pred; } -static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[0]); return pred; } -static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[-1], top[0]); (void)left; return pred; } -static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[0], top[1]); (void)left; return pred; } -static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average4(left, top[-1], top[0], top[1]); return pred; } -static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Select(top[0], left, top[-1]); return pred; } -static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); return pred; } -static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); return pred; } -GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0) -static void PredictorAdd1(const uint32_t* in, const uint32_t* upper, - int num_pixels, uint32_t* out) { +GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C) +static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper, + int num_pixels, uint32_t* out) { int i; uint32_t left = out[-1]; for (i = 0; i < num_pixels; ++i) { @@ -177,29 +177,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper, } (void)upper; } -GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2) -GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3) -GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4) -GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5) -GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6) -GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7) -GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8) -GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9) -GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10) -GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11) -GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12) -GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13) +GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C) +GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C) +GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C) +GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C) +GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C) +GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C) +GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C) +GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C) +GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C) +GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C) +GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C) +GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C) //------------------------------------------------------------------------------ // Inverse prediction. -static void PredictorInverseTransform(const VP8LTransform* const transform, - int y_start, int y_end, - const uint32_t* in, uint32_t* out) { +static void PredictorInverseTransform_C(const VP8LTransform* const transform, + int y_start, int y_end, + const uint32_t* in, uint32_t* out) { const int width = transform->xsize_; if (y_start == 0) { // First Row follows the L (mode=1) mode. - PredictorAdd0(in, NULL, 1, out); - PredictorAdd1(in + 1, NULL, width - 1, out + 1); + PredictorAdd0_C(in, NULL, 1, out); + PredictorAdd1_C(in + 1, NULL, width - 1, out + 1); in += width; out += width; ++y_start; @@ -217,7 +217,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform, const uint32_t* pred_mode_src = pred_mode_base; int x = 1; // First pixel follows the T (mode=2) mode. - PredictorAdd2(in, out - width, 1, out); + PredictorAdd2_C(in, out - width, 1, out); // .. the rest: while (x < width) { const VP8LPredictorAddSubFunc pred_func = @@ -270,23 +270,23 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, int i; for (i = 0; i < num_pixels; ++i) { const uint32_t argb = src[i]; - const uint32_t green = argb >> 8; + const int8_t green = (int8_t)(argb >> 8); const uint32_t red = argb >> 16; - int new_red = red; - int new_blue = argb; + int new_red = red & 0xff; + int new_blue = argb & 0xff; new_red += ColorTransformDelta(m->green_to_red_, green); new_red &= 0xff; new_blue += ColorTransformDelta(m->green_to_blue_, green); - new_blue += ColorTransformDelta(m->red_to_blue_, new_red); + new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red); new_blue &= 0xff; dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } } // Color space inverse transform. -static void ColorSpaceInverseTransform(const VP8LTransform* const transform, - int y_start, int y_end, - const uint32_t* src, uint32_t* dst) { +static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform, + int y_start, int y_end, + const uint32_t* src, uint32_t* dst) { const int width = transform->xsize_; const int tile_width = 1 << transform->bits_; const int mask = tile_width - 1; @@ -362,10 +362,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \ } \ } -COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b, - VP8GetARGBIndex, VP8GetARGBValue) -COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t, - 8b, VP8GetAlphaIndex, VP8GetAlphaValue) +COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static, + uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue) +COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, , + uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue) #undef COLOR_INDEX_INVERSE @@ -380,7 +380,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out); break; case PREDICTOR_TRANSFORM: - PredictorInverseTransform(transform, row_start, row_end, in, out); + PredictorInverseTransform_C(transform, row_start, row_end, in, out); if (row_end != transform->ysize_) { // The last predicted row in this iteration will be the top-pred row // for the first row in next iteration. @@ -389,7 +389,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, } break; case CROSS_COLOR_TRANSFORM: - ColorSpaceInverseTransform(transform, row_start, row_end, in, out); + ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out); break; case COLOR_INDEXING_TRANSFORM: if (in == out && transform->bits_ > 0) { @@ -403,9 +403,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform, VP8LSubSampleSize(transform->xsize_, transform->bits_); uint32_t* const src = out + out_stride - in_stride; memmove(src, out, in_stride * sizeof(*src)); - ColorIndexInverseTransform(transform, row_start, row_end, src, out); + ColorIndexInverseTransform_C(transform, row_start, row_end, src, out); } else { - ColorIndexInverseTransform(transform, row_start, row_end, in, out); + ColorIndexInverseTransform_C(transform, row_start, row_end, in, out); } break; } @@ -452,7 +452,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src, const uint32_t argb = *src++; const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf); const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) *dst++ = ba; *dst++ = rg; #else @@ -469,7 +469,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src, const uint32_t argb = *src++; const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7); const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) *dst++ = gb; *dst++ = rg; #else @@ -496,22 +496,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst, const uint32_t* const src_end = src + num_pixels; while (src < src_end) { const uint32_t argb = *src++; - -#if !defined(WORDS_BIGENDIAN) -#if !defined(WEBP_REFERENCE_IMPLEMENTATION) WebPUint32ToMem(dst, BSwap32(argb)); -#else // WEBP_REFERENCE_IMPLEMENTATION - dst[0] = (argb >> 24) & 0xff; - dst[1] = (argb >> 16) & 0xff; - dst[2] = (argb >> 8) & 0xff; - dst[3] = (argb >> 0) & 0xff; -#endif -#else // WORDS_BIGENDIAN - dst[0] = (argb >> 0) & 0xff; - dst[1] = (argb >> 8) & 0xff; - dst[2] = (argb >> 16) & 0xff; - dst[3] = (argb >> 24) & 0xff; -#endif dst += sizeof(argb); } } else { @@ -590,48 +575,46 @@ extern void VP8LDspInitNEON(void); extern void VP8LDspInitMIPSdspR2(void); extern void VP8LDspInitMSA(void); -static volatile VP8CPUInfo lossless_last_cpuinfo_used = - (VP8CPUInfo)&lossless_last_cpuinfo_used; - -#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \ - (OUT)[0] = IN##0; \ - (OUT)[1] = IN##1; \ - (OUT)[2] = IN##2; \ - (OUT)[3] = IN##3; \ - (OUT)[4] = IN##4; \ - (OUT)[5] = IN##5; \ - (OUT)[6] = IN##6; \ - (OUT)[7] = IN##7; \ - (OUT)[8] = IN##8; \ - (OUT)[9] = IN##9; \ - (OUT)[10] = IN##10; \ - (OUT)[11] = IN##11; \ - (OUT)[12] = IN##12; \ - (OUT)[13] = IN##13; \ - (OUT)[14] = IN##0; /* <- padding security sentinels*/ \ - (OUT)[15] = IN##0; \ +#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \ + (OUT)[0] = IN##0_C; \ + (OUT)[1] = IN##1_C; \ + (OUT)[2] = IN##2_C; \ + (OUT)[3] = IN##3_C; \ + (OUT)[4] = IN##4_C; \ + (OUT)[5] = IN##5_C; \ + (OUT)[6] = IN##6_C; \ + (OUT)[7] = IN##7_C; \ + (OUT)[8] = IN##8_C; \ + (OUT)[9] = IN##9_C; \ + (OUT)[10] = IN##10_C; \ + (OUT)[11] = IN##11_C; \ + (OUT)[12] = IN##12_C; \ + (OUT)[13] = IN##13_C; \ + (OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \ + (OUT)[15] = IN##0_C; \ } while (0); -WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) { - if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(VP8LDspInit) { COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors) COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C) +#if !WEBP_NEON_OMIT_C_CODE VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C; VP8LTransformColorInverse = VP8LTransformColorInverse_C; - VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C; VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C; + VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C; + VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C; +#endif + VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C; VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C; - VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C; - VP8LMapColor32b = MapARGB; - VP8LMapColor8b = MapAlpha; + VP8LMapColor32b = MapARGB_C; + VP8LMapColor8b = MapAlpha_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { @@ -640,11 +623,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) { VP8LDspInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8LDspInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { VP8LDspInitMIPSdspR2(); @@ -656,7 +634,23 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) { } #endif } - lossless_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8LDspInitNEON(); + } +#endif + + assert(VP8LAddGreenToBlueAndRed != NULL); + assert(VP8LTransformColorInverse != NULL); + assert(VP8LConvertBGRAToRGBA != NULL); + assert(VP8LConvertBGRAToRGB != NULL); + assert(VP8LConvertBGRAToBGR != NULL); + assert(VP8LConvertBGRAToRGBA4444 != NULL); + assert(VP8LConvertBGRAToRGB565 != NULL); + assert(VP8LMapColor32b != NULL); + assert(VP8LMapColor8b != NULL); } #undef COPY_PREDICTOR_ARRAY diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h index 352a54e..f709cc8 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.h +++ b/src/3rdparty/libwebp/src/dsp/lossless.h @@ -15,20 +15,16 @@ #ifndef WEBP_DSP_LOSSLESS_H_ #define WEBP_DSP_LOSSLESS_H_ -#include "../webp/types.h" -#include "../webp/decode.h" +#include "src/webp/types.h" +#include "src/webp/decode.h" -#include "../enc/histogram_enc.h" -#include "../utils/utils.h" +#include "src/enc/histogram_enc.h" +#include "src/utils/utils.h" #ifdef __cplusplus extern "C" { #endif -#ifdef WEBP_EXPERIMENTAL_FEATURES -#include "../enc/delta_palettization_enc.h" -#endif // WEBP_EXPERIMENTAL_FEATURES - //------------------------------------------------------------------------------ // Decoding @@ -124,7 +120,7 @@ void VP8LDspInit(void); typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels); extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed; typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m, - uint32_t* const dst, int num_pixels); + uint32_t* dst, int num_pixels); extern VP8LTransformColorFunc VP8LTransformColor; typedef void (*VP8LCollectColorBlueTransformsFunc)( const uint32_t* argb, int stride, @@ -167,7 +163,7 @@ extern VP8LCostCombinedFunc VP8LExtraCostCombined; extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; typedef struct { // small struct to hold counters - int counts[2]; // index: 0=zero steak, 1=non-zero streak + int counts[2]; // index: 0=zero streak, 1=non-zero streak int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3] } VP8LStreaks; @@ -198,10 +194,14 @@ extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, VP8LBitEntropy* const entropy); -typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out); -extern VP8LHistogramAddFunc VP8LHistogramAdd; +typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b, + uint32_t* out, int size); +extern VP8LAddVectorFunc VP8LAddVector; +typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size); +extern VP8LAddVectorEqFunc VP8LAddVectorEq; +void VP8LHistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, + VP8LHistogram* const out); // ----------------------------------------------------------------------------- // PrefixEncode() diff --git a/src/3rdparty/libwebp/src/dsp/lossless_common.h b/src/3rdparty/libwebp/src/dsp/lossless_common.h index c40f711..a2648d1 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_common.h +++ b/src/3rdparty/libwebp/src/dsp/lossless_common.h @@ -16,9 +16,9 @@ #ifndef WEBP_DSP_LOSSLESS_COMMON_H_ #define WEBP_DSP_LOSSLESS_COMMON_H_ -#include "../webp/types.h" +#include "src/webp/types.h" -#include "../utils/utils.h" +#include "src/utils/utils.h" #ifdef __cplusplus extern "C" { @@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) { // ----------------------------------------------------------------------------- // PrefixEncode() -static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { - const int log_floor = BitsLog2Floor(n); - if (n == (n & ~(n - 1))) { // zero or a power of two. - return log_floor; - } - return log_floor + 1; -} - // Splitting of distance and length codes into prefixes and // extra bits. The prefixes are encoded with an entropy code // while the extra bits are stored just as normal bits. diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c index 4e46fba..9c36055 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c @@ -13,15 +13,16 @@ // Jyrki Alakuijala (jyrki@google.com) // Urvang Joshi (urvang@google.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" +#include <assert.h> #include <math.h> #include <stdlib.h> -#include "../dec/vp8li_dec.h" -#include "../utils/endian_inl_utils.h" -#include "./lossless.h" -#include "./lossless_common.h" -#include "./yuv.h" +#include "src/dec/vp8li_dec.h" +#include "src/utils/endian_inl_utils.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" +#include "src/dsp/yuv.h" // lookup table for small values of log2(int) const float kLog2Table[LOG_LOOKUP_IDX_MAX] = { @@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = { 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126 }; -static float FastSLog2Slow(uint32_t v) { +static float FastSLog2Slow_C(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; @@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) { } } -static float FastLog2Slow(uint32_t v) { +static float FastLog2Slow_C(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { int log_cnt = 0; @@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) { // Methods to calculate Entropy (Shannon). // Compute the combined Shanon's entropy for distribution {X} and {X+Y} -static float CombinedShannonEntropy(const int X[256], const int Y[256]) { +static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) { int i; double retval = 0.; int sumX = 0, sumXY = 0; @@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper( *i_prev = i; } -static void GetEntropyUnrefined(const uint32_t X[], int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetEntropyUnrefined_C(const uint32_t X[], int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats) { int i; int i_prev = 0; uint32_t x_prev = X[0]; @@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length, bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); } -static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[], - int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetCombinedEntropyUnrefined_C(const uint32_t X[], + const uint32_t Y[], + int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats) { int i = 1; int i_prev = 0; uint32_t xy_prev = X[0] + Y[0]; @@ -513,15 +515,19 @@ static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) { return ((int)color_pred * color) >> 5; } +static WEBP_INLINE int8_t U32ToS8(uint32_t v) { + return (int8_t)(v & 0xff); +} + void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, int num_pixels) { int i; for (i = 0; i < num_pixels; ++i) { const uint32_t argb = data[i]; - const uint32_t green = argb >> 8; - const uint32_t red = argb >> 16; - int new_red = red; - int new_blue = argb; + const int8_t green = U32ToS8(argb >> 8); + const int8_t red = U32ToS8(argb >> 16); + int new_red = red & 0xff; + int new_blue = argb & 0xff; new_red -= ColorTransformDelta(m->green_to_red_, green); new_red &= 0xff; new_blue -= ColorTransformDelta(m->green_to_blue_, green); @@ -533,7 +539,7 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, uint32_t argb) { - const uint32_t green = argb >> 8; + const int8_t green = U32ToS8(argb >> 8); int new_red = argb >> 16; new_red -= ColorTransformDelta(green_to_red, green); return (new_red & 0xff); @@ -542,9 +548,9 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, uint8_t red_to_blue, uint32_t argb) { - const uint32_t green = argb >> 8; - const uint32_t red = argb >> 16; - uint8_t new_blue = argb; + const int8_t green = U32ToS8(argb >> 8); + const int8_t red = U32ToS8(argb >> 16); + uint8_t new_blue = argb & 0xff; new_blue -= ColorTransformDelta(green_to_blue, green); new_blue -= ColorTransformDelta(red_to_blue, red); return (new_blue & 0xff); @@ -556,7 +562,7 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride, while (tile_height-- > 0) { int x; for (x = 0; x < tile_width; ++x) { - ++histo[TransformColorRed(green_to_red, argb[x])]; + ++histo[TransformColorRed((uint8_t)green_to_red, argb[x])]; } argb += stride; } @@ -569,7 +575,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, while (tile_height-- > 0) { int x; for (x = 0; x < tile_width; ++x) { - ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[x])]; + ++histo[TransformColorBlue((uint8_t)green_to_blue, (uint8_t)red_to_blue, + argb[x])]; } argb += stride; } @@ -577,8 +584,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, //------------------------------------------------------------------------------ -static int VectorMismatch(const uint32_t* const array1, - const uint32_t* const array2, int length) { +static int VectorMismatch_C(const uint32_t* const array1, + const uint32_t* const array2, int length) { int match_len = 0; while (match_len < length && array1[match_len] == array2[match_len]) { @@ -610,15 +617,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, //------------------------------------------------------------------------------ -static double ExtraCost(const uint32_t* population, int length) { +static double ExtraCost_C(const uint32_t* population, int length) { int i; double cost = 0.; for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2]; return cost; } -static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y, - int length) { +static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, + int length) { int i; double cost = 0.; for (i = 2; i < length - 2; ++i) { @@ -630,38 +637,67 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y, //------------------------------------------------------------------------------ -static void HistogramAdd(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { +static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out, + int size) { + int i; + for (i = 0; i < size; ++i) out[i] = a[i] + b[i]; +} + +static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) { + int i; + for (i = 0; i < size; ++i) out[i] += a[i]; +} + +#define ADD(X, ARG, LEN) do { \ + if (a->is_used_[X]) { \ + if (b->is_used_[X]) { \ + VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \ + } else { \ + memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } \ + } else if (b->is_used_[X]) { \ + memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } else { \ + memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \ + } \ +} while (0) + +#define ADD_EQ(X, ARG, LEN) do { \ + if (a->is_used_[X]) { \ + if (out->is_used_[X]) { \ + VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \ + } else { \ + memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } \ + } \ +} while (0) + +void VP8LHistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, VP8LHistogram* const out) { int i; const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); assert(a->palette_code_bits_ == b->palette_code_bits_); + if (b != out) { - for (i = 0; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] = a->red_[i] + b->red_[i]; - out->blue_[i] = a->blue_[i] + b->blue_[i]; - out->alpha_[i] = a->alpha_[i] + b->alpha_[i]; + ADD(0, literal_, literal_size); + ADD(1, red_, NUM_LITERAL_CODES); + ADD(2, blue_, NUM_LITERAL_CODES); + ADD(3, alpha_, NUM_LITERAL_CODES); + ADD(4, distance_, NUM_DISTANCE_CODES); + for (i = 0; i < 5; ++i) { + out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]); } } else { - for (i = 0; i < literal_size; ++i) { - out->literal_[i] += a->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] += a->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] += a->red_[i]; - out->blue_[i] += a->blue_[i]; - out->alpha_[i] += a->alpha_[i]; - } + ADD_EQ(0, literal_, literal_size); + ADD_EQ(1, red_, NUM_LITERAL_CODES); + ADD_EQ(2, blue_, NUM_LITERAL_CODES); + ADD_EQ(3, alpha_, NUM_LITERAL_CODES); + ADD_EQ(4, distance_, NUM_DISTANCE_CODES); + for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i]; } } +#undef ADD +#undef ADD_EQ //------------------------------------------------------------------------------ // Image transforms. @@ -846,7 +882,8 @@ VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined; -VP8LHistogramAddFunc VP8LHistogramAdd; +VP8LAddVectorFunc VP8LAddVector; +VP8LAddVectorEqFunc VP8LAddVectorEq; VP8LVectorMismatchFunc VP8LVectorMismatch; VP8LBundleColorMapFunc VP8LBundleColorMap; @@ -861,34 +898,32 @@ extern void VP8LEncDspInitMIPS32(void); extern void VP8LEncDspInitMIPSdspR2(void); extern void VP8LEncDspInitMSA(void); -static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used = - (VP8CPUInfo)&lossless_enc_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { - if (lossless_enc_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { VP8LDspInit(); +#if !WEBP_NEON_OMIT_C_CODE VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C; VP8LTransformColor = VP8LTransformColor_C; +#endif VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C; VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C; - VP8LFastLog2Slow = FastLog2Slow; - VP8LFastSLog2Slow = FastSLog2Slow; + VP8LFastLog2Slow = FastLog2Slow_C; + VP8LFastSLog2Slow = FastSLog2Slow_C; - VP8LExtraCost = ExtraCost; - VP8LExtraCostCombined = ExtraCostCombined; - VP8LCombinedShannonEntropy = CombinedShannonEntropy; + VP8LExtraCost = ExtraCost_C; + VP8LExtraCostCombined = ExtraCostCombined_C; + VP8LCombinedShannonEntropy = CombinedShannonEntropy_C; - VP8LGetEntropyUnrefined = GetEntropyUnrefined; - VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined; + VP8LGetEntropyUnrefined = GetEntropyUnrefined_C; + VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C; - VP8LHistogramAdd = HistogramAdd; + VP8LAddVector = AddVector_C; + VP8LAddVectorEq = AddVectorEq_C; - VP8LVectorMismatch = VectorMismatch; + VP8LVectorMismatch = VectorMismatch_C; VP8LBundleColorMap = VP8LBundleColorMap_C; VP8LPredictorsSub[0] = PredictorSub0_C; @@ -937,11 +972,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { #endif } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - VP8LEncDspInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { VP8LEncDspInitMIPS32(); @@ -958,7 +988,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { } #endif } - lossless_enc_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + VP8LEncDspInitNEON(); + } +#endif + + assert(VP8LSubtractGreenFromBlueAndRed != NULL); + assert(VP8LTransformColor != NULL); + assert(VP8LCollectColorBlueTransforms != NULL); + assert(VP8LCollectColorRedTransforms != NULL); + assert(VP8LFastLog2Slow != NULL); + assert(VP8LFastSLog2Slow != NULL); + assert(VP8LExtraCost != NULL); + assert(VP8LExtraCostCombined != NULL); + assert(VP8LCombinedShannonEntropy != NULL); + assert(VP8LGetEntropyUnrefined != NULL); + assert(VP8LGetCombinedEntropyUnrefined != NULL); + assert(VP8LAddVector != NULL); + assert(VP8LAddVectorEq != NULL); + assert(VP8LVectorMismatch != NULL); + assert(VP8LBundleColorMap != NULL); + assert(VP8LPredictorsSub[0] != NULL); + assert(VP8LPredictorsSub[1] != NULL); + assert(VP8LPredictorsSub[2] != NULL); + assert(VP8LPredictorsSub[3] != NULL); + assert(VP8LPredictorsSub[4] != NULL); + assert(VP8LPredictorsSub[5] != NULL); + assert(VP8LPredictorsSub[6] != NULL); + assert(VP8LPredictorsSub[7] != NULL); + assert(VP8LPredictorsSub[8] != NULL); + assert(VP8LPredictorsSub[9] != NULL); + assert(VP8LPredictorsSub[10] != NULL); + assert(VP8LPredictorsSub[11] != NULL); + assert(VP8LPredictorsSub[12] != NULL); + assert(VP8LPredictorsSub[13] != NULL); + assert(VP8LPredictorsSub[14] != NULL); + assert(VP8LPredictorsSub[15] != NULL); + assert(VP8LPredictorsSub_C[0] != NULL); + assert(VP8LPredictorsSub_C[1] != NULL); + assert(VP8LPredictorsSub_C[2] != NULL); + assert(VP8LPredictorsSub_C[3] != NULL); + assert(VP8LPredictorsSub_C[4] != NULL); + assert(VP8LPredictorsSub_C[5] != NULL); + assert(VP8LPredictorsSub_C[6] != NULL); + assert(VP8LPredictorsSub_C[7] != NULL); + assert(VP8LPredictorsSub_C[8] != NULL); + assert(VP8LPredictorsSub_C[9] != NULL); + assert(VP8LPredictorsSub_C[10] != NULL); + assert(VP8LPredictorsSub_C[11] != NULL); + assert(VP8LPredictorsSub_C[12] != NULL); + assert(VP8LPredictorsSub_C[13] != NULL); + assert(VP8LPredictorsSub_C[14] != NULL); + assert(VP8LPredictorsSub_C[15] != NULL); } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c index 4186b9f..0412a09 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c @@ -12,9 +12,9 @@ // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) // Jovan Zelincevic (jovan.zelincevic@imgtec.com) -#include "./dsp.h" -#include "./lossless.h" -#include "./lossless_common.h" +#include "src/dsp/dsp.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" #if defined(WEBP_USE_MIPS32) @@ -23,7 +23,7 @@ #include <stdlib.h> #include <string.h> -static float FastSLog2Slow(uint32_t v) { +static float FastSLog2Slow_MIPS32(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { uint32_t log_cnt, y, correction; @@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) { } } -static float FastLog2Slow(uint32_t v) { +static float FastLog2Slow_MIPS32(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { uint32_t log_cnt, y; @@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) { // pop += 2; // } // return (double)cost; -static double ExtraCost(const uint32_t* const population, int length) { +static double ExtraCost_MIPS32(const uint32_t* const population, int length) { int i, temp0, temp1; const uint32_t* pop = &population[4]; const uint32_t* const LoopEnd = &population[length]; @@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) { // pY += 2; // } // return (double)cost; -static double ExtraCostCombined(const uint32_t* const X, - const uint32_t* const Y, int length) { +static double ExtraCostCombined_MIPS32(const uint32_t* const X, + const uint32_t* const Y, int length) { int i, temp0, temp1, temp2, temp3; const uint32_t* pX = &X[4]; const uint32_t* pY = &Y[4]; @@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper( *i_prev = i; } -static void GetEntropyUnrefined(const uint32_t X[], int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, + VP8LBitEntropy* const bit_entropy, + VP8LStreaks* const stats) { int i; int i_prev = 0; uint32_t x_prev = X[0]; @@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length, bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); } -static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[], - int length, - VP8LBitEntropy* const bit_entropy, - VP8LStreaks* const stats) { +static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], + const uint32_t Y[], + int length, + VP8LBitEntropy* const entropy, + VP8LStreaks* const stats) { int i = 1; int i_prev = 0; uint32_t xy_prev = X[0] + Y[0]; memset(stats, 0, sizeof(*stats)); - VP8LBitEntropyInit(bit_entropy); + VP8LBitEntropyInit(entropy); for (i = 1; i < length; ++i) { const uint32_t xy = X[i] + Y[i]; if (xy != xy_prev) { - GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats); + GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats); } } - GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats); + GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats); - bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); + entropy->entropy += VP8LFastSLog2(entropy->sum); } #define ASM_START \ @@ -343,65 +344,29 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[], ASM_END_COMMON_0 \ ASM_END_COMMON_1 -#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \ - const uint32_t* pa = (const uint32_t*)(A); \ - const uint32_t* pb = (const uint32_t*)(B); \ - uint32_t* pout = (uint32_t*)(OUT); \ - const uint32_t* const LoopEnd = pa + (SIZE); \ - assert((SIZE) % 4 == 0); \ - ASM_START \ - ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \ - ASM_END_0 \ - if ((EXTRA_SIZE) > 0) { \ - const int last = (EXTRA_SIZE); \ - int i; \ - for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \ - } \ -} while (0) - -#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \ - const uint32_t* pa = (const uint32_t*)(A); \ - uint32_t* pout = (uint32_t*)(OUT); \ - const uint32_t* const LoopEnd = pa + (SIZE); \ - assert((SIZE) % 4 == 0); \ - ASM_START \ - ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \ - ASM_END_1 \ - if ((EXTRA_SIZE) > 0) { \ - const int last = (EXTRA_SIZE); \ - int i; \ - for (i = 0; i < last; ++i) pout[i] += pa[i]; \ - } \ -} while (0) - -static void HistogramAdd(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { +static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, + uint32_t* pout, int size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_) - - (NUM_LITERAL_CODES + NUM_LENGTH_CODES); - assert(a->palette_code_bits_ == b->palette_code_bits_); - - if (b != out) { - ADD_VECTOR(a->literal_, b->literal_, out->literal_, - NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); - ADD_VECTOR(a->distance_, b->distance_, out->distance_, - NUM_DISTANCE_CODES, 0); - ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0); - ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0); - ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); - } else { - ADD_VECTOR_EQ(a->literal_, out->literal_, - NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); - ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0); - ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0); - ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0); - ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); - } + const uint32_t end = ((size) / 4) * 4; + const uint32_t* const LoopEnd = pa + end; + int i; + ASM_START + ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) + ASM_END_0 + for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i]; +} + +static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { + uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + const uint32_t end = ((size) / 4) * 4; + const uint32_t* const LoopEnd = pa + end; + int i; + ASM_START + ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) + ASM_END_1 + for (i = end; i < size; ++i) pout[i] += pa[i]; } -#undef ADD_VECTOR_EQ -#undef ADD_VECTOR #undef ASM_END_1 #undef ASM_END_0 #undef ASM_END_COMMON_1 @@ -415,13 +380,14 @@ static void HistogramAdd(const VP8LHistogram* const a, extern void VP8LEncDspInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { - VP8LFastSLog2Slow = FastSLog2Slow; - VP8LFastLog2Slow = FastLog2Slow; - VP8LExtraCost = ExtraCost; - VP8LExtraCostCombined = ExtraCostCombined; - VP8LGetEntropyUnrefined = GetEntropyUnrefined; - VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined; - VP8LHistogramAdd = HistogramAdd; + VP8LFastSLog2Slow = FastSLog2Slow_MIPS32; + VP8LFastLog2Slow = FastLog2Slow_MIPS32; + VP8LExtraCost = ExtraCost_MIPS32; + VP8LExtraCostCombined = ExtraCostCombined_MIPS32; + VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32; + VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32; + VP8LAddVector = AddVector_MIPS32; + VP8LAddVectorEq = AddVectorEq_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c index 0abf3c4..5855e6a 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c @@ -12,14 +12,14 @@ // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) // Jovan Zelincevic (jovan.zelincevic@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "./lossless.h" +#include "src/dsp/lossless.h" -static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, - int num_pixels) { +static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data, + int num_pixels) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3); uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3); @@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred, return (uint32_t)((int)(color_pred) * color) >> 5; } -static void TransformColor(const VP8LMultipliers* const m, uint32_t* data, - int num_pixels) { +static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m, + uint32_t* data, int num_pixels) { int temp0, temp1, temp2, temp3, temp4, temp5; uint32_t argb, argb1, new_red, new_red1; const uint32_t G_to_R = m->green_to_red_; @@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, return (new_blue & 0xff); } -static void CollectColorBlueTransforms(const uint32_t* argb, int stride, - int tile_width, int tile_height, - int green_to_blue, int red_to_blue, - int histo[]) { +static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb, + int stride, + int tile_width, + int tile_height, + int green_to_blue, + int red_to_blue, + int histo[]) { const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff); const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff); const uint32_t mask = 0xff00ffu; @@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, return (new_red & 0xff); } -static void CollectColorRedTransforms(const uint32_t* argb, int stride, - int tile_width, int tile_height, - int green_to_red, int histo[]) { +static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb, + int stride, + int tile_width, + int tile_height, + int green_to_red, + int histo[]) { const int gtr = (green_to_red << 16) | (green_to_red & 0xffff); while (tile_height-- > 0) { int x; @@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride, extern void VP8LEncDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) { - VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; - VP8LTransformColor = TransformColor; - VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; - VP8LCollectColorRedTransforms = CollectColorRedTransforms; + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2; + VP8LTransformColor = TransformColor_MIPSdspR2; + VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2; + VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c index 2f69ba3..600dddf 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c @@ -11,12 +11,12 @@ // // Authors: Prashant Patil (Prashant.Patil@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MSA) -#include "./lossless.h" -#include "./msa_macro.h" +#include "src/dsp/lossless.h" +#include "src/dsp/msa_macro.h" #define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \ v8i16 g0, g1, t0, t1, t2, t3; \ @@ -48,8 +48,8 @@ dst = VSHF_UB(src, t0, mask1); \ } while (0) -static void TransformColor(const VP8LMultipliers* const m, uint32_t* data, - int num_pixels) { +static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data, + int num_pixels) { v16u8 src0, dst0; const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | (m->green_to_red_ << 16)); @@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data, } } -static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { +static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data, + int num_pixels) { int i; uint8_t* ptemp_data = (uint8_t*)argb_data; v16u8 src0, dst0, tmp0; @@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { extern void VP8LEncDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) { - VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; - VP8LTransformColor = TransformColor; + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA; + VP8LTransformColor = TransformColor_MSA; } #else // !WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c index 4c56f25..7c7b73f 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c @@ -11,14 +11,14 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <arm_neon.h> -#include "./lossless.h" -#include "./neon.h" +#include "src/dsp/lossless.h" +#include "src/dsp/neon.h" //------------------------------------------------------------------------------ // Subtract-Green Transform @@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255 }; -static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, - const uint8x16_t shuffle) { +static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb, + const uint8x16_t shuffle) { return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)), vtbl1q_u8(argb, vget_high_u8(shuffle))); } @@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, // 255 = byte will be zeroed static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; -static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, - const uint8x8_t shuffle) { +static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb, + const uint8x8_t shuffle) { return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); } #endif // USE_VTBLQ -static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { +static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data, + int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); #ifdef USE_VTBLQ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); @@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { #endif for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); - const uint8x16_t greens = DoGreenShuffle(argb, shuffle); + const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle); vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens)); } // fallthrough and finish off with plain-C @@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { //------------------------------------------------------------------------------ // Color Transform -static void TransformColor(const VP8LMultipliers* const m, - uint32_t* argb_data, int num_pixels) { +static void TransformColor_NEON(const VP8LMultipliers* const m, + uint32_t* argb_data, int num_pixels) { // sign-extended multiplying constants, pre-shifted by 6. #define CST(X) (((int16_t)(m->X << 8)) >> 6) const int16_t rb[8] = { @@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m, for (i = 0; i + 4 <= num_pixels; i += 4) { const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i)); // 0 g 0 g - const uint8x16_t greens = DoGreenShuffle(in, shuffle); + const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle); // x dr x db1 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); // r 0 b 0 @@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m, extern void VP8LEncDspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) { - VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; - VP8LTransformColor = TransformColor; + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON; + VP8LTransformColor = TransformColor_NEON; } #else // !WEBP_USE_NEON diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c index 8ad85d9..8adc521 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -11,22 +11,23 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) #include <assert.h> #include <emmintrin.h> -#include "./lossless.h" -#include "./common_sse2.h" -#include "./lossless_common.h" +#include "src/dsp/lossless.h" +#include "src/dsp/common_sse2.h" +#include "src/dsp/lossless_common.h" // For sign-extended multiplying constants, pre-shifted by 5: -#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5) +#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) //------------------------------------------------------------------------------ // Subtract-Green Transform -static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { +static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data, + int num_pixels) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb @@ -45,16 +46,14 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { //------------------------------------------------------------------------------ // Color Transform -static void TransformColor(const VP8LMultipliers* const m, - uint32_t* argb_data, int num_pixels) { - const __m128i mults_rb = _mm_set_epi16( - CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), - CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), - CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), - CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); - const __m128i mults_b2 = _mm_set_epi16( - CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0, - CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0); +#define MK_CST_16(HI, LO) \ + _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) + +static void TransformColor_SSE2(const VP8LMultipliers* const m, + uint32_t* argb_data, int num_pixels) { + const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), + CST_5b(m->green_to_blue_)); + const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks int i; @@ -80,16 +79,12 @@ static void TransformColor(const VP8LMultipliers* const m, //------------------------------------------------------------------------------ #define SPAN 8 -static void CollectColorBlueTransforms(const uint32_t* argb, int stride, - int tile_width, int tile_height, - int green_to_blue, int red_to_blue, - int histo[]) { - const __m128i mults_r = _mm_set_epi16( - CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0, - CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0); - const __m128i mults_g = _mm_set_epi16( - 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue), - 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue)); +static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride, + int tile_width, int tile_height, + int green_to_blue, int red_to_blue, + int histo[]) { + const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0); + const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue)); const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask int y; @@ -131,12 +126,10 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride, } } -static void CollectColorRedTransforms(const uint32_t* argb, int stride, - int tile_width, int tile_height, - int green_to_red, int histo[]) { - const __m128i mults_g = _mm_set_epi16( - 0, CST_5b(green_to_red), 0, CST_5b(green_to_red), - 0, CST_5b(green_to_red), 0, CST_5b(green_to_red)); +static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride, + int tile_width, int tile_height, + int green_to_red, int histo[]) { + const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red)); const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask = _mm_set1_epi32(0xff); @@ -173,15 +166,17 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride, } } #undef SPAN +#undef MK_CST_16 //------------------------------------------------------------------------------ +// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But +// that's ok since the histogram values are less than 1<<28 (max picture size). #define LINE_SIZE 16 // 8 or 16 -static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out, - int size) { +static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, + int size) { int i; - assert(size % LINE_SIZE == 0); - for (i = 0; i < size; i += LINE_SIZE) { + for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); #if (LINE_SIZE == 16) @@ -201,12 +196,14 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out, _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); #endif } + for (; i < size; ++i) { + out[i] = a[i] + b[i]; + } } -static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) { +static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { int i; - assert(size % LINE_SIZE == 0); - for (i = 0; i < size; i += LINE_SIZE) { + for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); #if (LINE_SIZE == 16) @@ -226,44 +223,20 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) { _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); #endif } -} -#undef LINE_SIZE - -// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But -// that's ok since the histogram values are less than 1<<28 (max picture size). -static void HistogramAdd(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { - int i; - const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); - assert(a->palette_code_bits_ == b->palette_code_bits_); - if (b != out) { - AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES); - AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES); - AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES); - AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES); - } else { - AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES); - AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES); - AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES); - AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES); - } - for (i = NUM_LITERAL_CODES; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; + for (; i < size; ++i) { + out[i] += a[i]; } } +#undef LINE_SIZE //------------------------------------------------------------------------------ // Entropy // Checks whether the X or Y contribution is worth computing and adding. // Used in loop unrolling. -#define ANALYZE_X_OR_Y(x_or_y, j) \ - do { \ - if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \ +#define ANALYZE_X_OR_Y(x_or_y, j) \ + do { \ + if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \ } while (0) // Checks whether the X + Y contribution is worth computing and adding. @@ -276,7 +249,7 @@ static void HistogramAdd(const VP8LHistogram* const a, } \ } while (0) -static float CombinedShannonEntropy(const int X[256], const int Y[256]) { +static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { int i; double retval = 0.; int sumX, sumXY; @@ -332,8 +305,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) { //------------------------------------------------------------------------------ -static int VectorMismatch(const uint32_t* const array1, - const uint32_t* const array2, int length) { +static int VectorMismatch_SSE2(const uint32_t* const array1, + const uint32_t* const array2, int length) { int match_len; if (length >= 12) { @@ -390,7 +363,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, assert(xbits <= 3); switch (xbits) { case 0: { - const __m128i ff = _mm_set1_epi16(0xff00); + const __m128i ff = _mm_set1_epi16((short)0xff00); const __m128i zero = _mm_setzero_si128(); // Store 0xff000000 | (row[x] << 8). for (x = 0; x + 16 <= width; x += 16, dst += 16) { @@ -409,7 +382,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, break; } case 1: { - const __m128i ff = _mm_set1_epi16(0xff00); + const __m128i ff = _mm_set1_epi16((short)0xff00); const __m128i mul = _mm_set1_epi16(0x110); for (x = 0; x + 16 <= width; x += 16, dst += 8) { // 0a0b | (where a/b are 4 bits). @@ -574,8 +547,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper, } // Predictor11: select. -static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B, - __m128i* const out) { +static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B, + __m128i* const out) { // We can unpack with any value on the upper 32 bits, provided it's the same // on both operands (to that their sum of abs diff is zero). Here we use *A. const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); @@ -596,8 +569,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); __m128i pa, pb; - GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL| - GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL| + GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL| + GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL| { const __m128i mask = _mm_cmpgt_epi32(pb, pa); const __m128i A = _mm_and_si128(mask, L); @@ -677,13 +650,14 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper, extern void VP8LEncDspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { - VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; - VP8LTransformColor = TransformColor; - VP8LCollectColorBlueTransforms = CollectColorBlueTransforms; - VP8LCollectColorRedTransforms = CollectColorRedTransforms; - VP8LHistogramAdd = HistogramAdd; - VP8LCombinedShannonEntropy = CombinedShannonEntropy; - VP8LVectorMismatch = VectorMismatch; + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2; + VP8LTransformColor = TransformColor_SSE2; + VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2; + VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2; + VP8LAddVector = AddVector_SSE2; + VP8LAddVectorEq = AddVectorEq_SSE2; + VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2; + VP8LVectorMismatch = VectorMismatch_SSE2; VP8LBundleColorMap = BundleColorMap_SSE2; VP8LPredictorsSub[0] = PredictorSub0_SSE2; diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c index 821057c..719d8ed 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c @@ -11,17 +11,21 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE41) #include <assert.h> #include <smmintrin.h> -#include "./lossless.h" +#include "src/dsp/lossless.h" + +// For sign-extended multiplying constants, pre-shifted by 5: +#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) //------------------------------------------------------------------------------ // Subtract-Green Transform -static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { +static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, + int num_pixels) { int i; const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1); @@ -38,12 +42,103 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { } //------------------------------------------------------------------------------ +// Color Transform + +#define SPAN 8 +static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride, + int tile_width, int tile_height, + int green_to_blue, int red_to_blue, + int histo[]) { + const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue)); + const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue)); + const __m128i mask_g = _mm_set1_epi16((short)0xff00); // green mask + const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask + const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask + const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1, + -1, -1, -1, -1, -1, -1, -1); + const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, + 2, -1, 6, -1, 10, -1, 14); + int y; + for (y = 0; y < tile_height; ++y) { + const uint32_t* const src = argb + y * stride; + int i, x; + for (x = 0; x + SPAN <= tile_width; x += SPAN) { + uint16_t values[SPAN]; + const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); + const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); + const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo); + const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi); + const __m128i r = _mm_or_si128(r0, r1); // r 0 + const __m128i gb0 = _mm_and_si128(in0, mask_gb); + const __m128i gb1 = _mm_and_si128(in1, mask_gb); + const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b + const __m128i g = _mm_and_si128(gb, mask_g); // g 0 + const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr + const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg + const __m128i C = _mm_sub_epi8(gb, B); // x b' + const __m128i D = _mm_sub_epi8(C, A); // x b'' + const __m128i E = _mm_and_si128(D, mask_b); // 0 b'' + _mm_storeu_si128((__m128i*)values, E); + for (i = 0; i < SPAN; ++i) ++histo[values[i]]; + } + } + { + const int left_over = tile_width & (SPAN - 1); + if (left_over > 0) { + VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, + left_over, tile_height, + green_to_blue, red_to_blue, histo); + } + } +} + +static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, + int tile_width, int tile_height, + int green_to_red, int histo[]) { + const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red)); + const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask + const __m128i mask = _mm_set1_epi16(0xff); + + int y; + for (y = 0; y < tile_height; ++y) { + const uint32_t* const src = argb + y * stride; + int i, x; + for (x = 0; x + SPAN <= tile_width; x += SPAN) { + uint16_t values[SPAN]; + const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); + const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); + const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 + const __m128i g1 = _mm_and_si128(in1, mask_g); + const __m128i g = _mm_packus_epi32(g0, g1); // g 0 + const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r + const __m128i A1 = _mm_srli_epi32(in1, 16); + const __m128i A = _mm_packus_epi32(A0, A1); // x r + const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr + const __m128i C = _mm_sub_epi8(A, B); // x r' + const __m128i D = _mm_and_si128(C, mask); // 0 r' + _mm_storeu_si128((__m128i*)values, D); + for (i = 0; i < SPAN; ++i) ++histo[values[i]]; + } + } + { + const int left_over = tile_width & (SPAN - 1); + if (left_over > 0) { + VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, + left_over, tile_height, green_to_red, + histo); + } + } +} + +//------------------------------------------------------------------------------ // Entry point extern void VP8LEncDspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { - VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; + VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; + VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; + VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; } #else // !WEBP_USE_SSE41 diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c index 2984ce8..9888854 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c @@ -12,12 +12,12 @@ // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) // Jovan Zelincevic (jovan.zelincevic@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "./lossless.h" -#include "./lossless_common.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" #define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \ static void FUNC_NAME(const TYPE* src, \ @@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src, \ } \ } -MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue) -MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue) +MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue) +MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue) #undef MAP_COLOR_FUNCS @@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, return Average2(Average2(a0, a1), Average2(a2, a3)); } -static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) { return Average3(left, top[0], top[1]); } -static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) { return Average2(left, top[-1]); } -static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) { return Average2(left, top[0]); } -static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) { (void)left; return Average2(top[-1], top[0]); } -static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) { (void)left; return Average2(top[0], top[1]); } -static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor10_MIPSdspR2(uint32_t left, + const uint32_t* const top) { return Average4(left, top[-1], top[0], top[1]); } -static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor11_MIPSdspR2(uint32_t left, + const uint32_t* const top) { return Select(top[0], left, top[-1]); } -static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor12_MIPSdspR2(uint32_t left, + const uint32_t* const top) { return ClampedAddSubtractFull(left, top[0], top[-1]); } -static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { +static uint32_t Predictor13_MIPSdspR2(uint32_t left, + const uint32_t* const top) { return ClampedAddSubtractHalf(left, top[0], top[-1]); } // Add green to blue and red channels (i.e. perform the inverse transform of // 'subtract green'). -static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, - uint32_t* dst) { +static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels, + uint32_t* dst) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; const uint32_t* const p_loop1_end = src + (num_pixels & ~3); const uint32_t* const p_loop2_end = src + num_pixels; @@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, ); } -static void TransformColorInverse(const VP8LMultipliers* const m, - const uint32_t* src, int num_pixels, - uint32_t* dst) { +static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m, + const uint32_t* src, int num_pixels, + uint32_t* dst) { int temp0, temp1, temp2, temp3, temp4, temp5; uint32_t argb, argb1, new_red; const uint32_t G_to_R = m->green_to_red_; @@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m, if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst); } -static void ConvertBGRAToRGB(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src, + int num_pixels, uint8_t* dst) { int temp0, temp1, temp2, temp3; const uint32_t* const p_loop1_end = src + (num_pixels & ~3); const uint32_t* const p_loop2_end = src + num_pixels; @@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src, ); } -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src, + int num_pixels, uint8_t* dst) { int temp0, temp1, temp2, temp3; const uint32_t* const p_loop1_end = src + (num_pixels & ~3); const uint32_t* const p_loop2_end = src + num_pixels; @@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src, ); } -static void ConvertBGRAToRGBA4444(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src, + int num_pixels, uint8_t* dst) { int temp0, temp1, temp2, temp3, temp4, temp5; const uint32_t* const p_loop1_end = src + (num_pixels & ~3); const uint32_t* const p_loop2_end = src + num_pixels; @@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, "ins %[temp3], %[temp5], 16, 4 \n\t" "addiu %[src], %[src], 16 \n\t" "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t" -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) "usw %[temp1], 0(%[dst]) \n\t" "usw %[temp3], 4(%[dst]) \n\t" #else @@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, "ins %[temp0], %[temp5], 16, 4 \n\t" "addiu %[src], %[src], 4 \n\t" "precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t" -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) "ush %[temp0], 0(%[dst]) \n\t" #else "wsbh %[temp0], %[temp0] \n\t" @@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, ); } -static void ConvertBGRAToRGB565(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src, + int num_pixels, uint8_t* dst) { int temp0, temp1, temp2, temp3, temp4, temp5; const uint32_t* const p_loop1_end = src + (num_pixels & ~3); const uint32_t* const p_loop2_end = src + num_pixels; @@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src, "ins %[temp2], %[temp3], 0, 5 \n\t" "addiu %[src], %[src], 16 \n\t" "append %[temp2], %[temp1], 16 \n\t" -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) "usw %[temp0], 0(%[dst]) \n\t" "usw %[temp2], 4(%[dst]) \n\t" #else @@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src, "ins %[temp4], %[temp5], 0, 11 \n\t" "addiu %[src], %[src], 4 \n\t" "ins %[temp4], %[temp0], 0, 5 \n\t" -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) "ush %[temp4], 0(%[dst]) \n\t" #else "wsbh %[temp4], %[temp4] \n\t" @@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src, ); } -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src, + int num_pixels, uint8_t* dst) { int temp0, temp1, temp2, temp3; const uint32_t* const p_loop1_end = src + (num_pixels & ~3); const uint32_t* const p_loop2_end = src + num_pixels; @@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src, extern void VP8LDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) { - VP8LMapColor32b = MapARGB; - VP8LMapColor8b = MapAlpha; - VP8LPredictors[5] = Predictor5; - VP8LPredictors[6] = Predictor6; - VP8LPredictors[7] = Predictor7; - VP8LPredictors[8] = Predictor8; - VP8LPredictors[9] = Predictor9; - VP8LPredictors[10] = Predictor10; - VP8LPredictors[11] = Predictor11; - VP8LPredictors[12] = Predictor12; - VP8LPredictors[13] = Predictor13; - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; - VP8LTransformColorInverse = TransformColorInverse; - VP8LConvertBGRAToRGB = ConvertBGRAToRGB; - VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; - VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444; - VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; - VP8LConvertBGRAToBGR = ConvertBGRAToBGR; + VP8LMapColor32b = MapARGB_MIPSdspR2; + VP8LMapColor8b = MapAlpha_MIPSdspR2; + + VP8LPredictors[5] = Predictor5_MIPSdspR2; + VP8LPredictors[6] = Predictor6_MIPSdspR2; + VP8LPredictors[7] = Predictor7_MIPSdspR2; + VP8LPredictors[8] = Predictor8_MIPSdspR2; + VP8LPredictors[9] = Predictor9_MIPSdspR2; + VP8LPredictors[10] = Predictor10_MIPSdspR2; + VP8LPredictors[11] = Predictor11_MIPSdspR2; + VP8LPredictors[12] = Predictor12_MIPSdspR2; + VP8LPredictors[13] = Predictor13_MIPSdspR2; + + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2; + VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2; + + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2; + VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2; + VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2; + VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/lossless_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_msa.c index f6dd564..9f54720 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_msa.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_msa.c @@ -11,12 +11,12 @@ // // Author: Prashant Patil (prashant.patil@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MSA) -#include "./lossless.h" -#include "./msa_macro.h" +#include "src/dsp/lossless.h" +#include "src/dsp/msa_macro.h" //------------------------------------------------------------------------------ // Colorspace conversion functions @@ -43,7 +43,7 @@ #define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \ uint64_t pix_d; \ - v16u8 src0, src1, src2, dst0, dst1; \ + v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \ LD_UB2(psrc, 16, src0, src1); \ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \ ST_UB(dst0, pdst); \ @@ -109,8 +109,8 @@ dst = VSHF_UB(src, t0, mask1); \ } while (0) -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_MSA(const uint32_t* src, + int num_pixels, uint8_t* dst) { int i; const uint8_t* ptemp_src = (const uint8_t*)src; uint8_t* ptemp_dst = (uint8_t*)dst; @@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src, } } -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_MSA(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint8_t* ptemp_src = (const uint8_t*)src; uint8_t* ptemp_dst = (uint8_t*)dst; const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, @@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src, } } -static void ConvertBGRAToRGB(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_MSA(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint8_t* ptemp_src = (const uint8_t*)src; uint8_t* ptemp_dst = (uint8_t*)dst; const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, @@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src, } } -static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels, + uint32_t* dst) { int i; const uint8_t* in = (const uint8_t*)src; uint8_t* out = (uint8_t*)dst; @@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, } } -static void TransformColorInverse(const VP8LMultipliers* const m, - const uint32_t* src, int num_pixels, - uint32_t* dst) { +static void TransformColorInverse_MSA(const VP8LMultipliers* const m, + const uint32_t* src, int num_pixels, + uint32_t* dst) { v16u8 src0, dst0; const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | (m->green_to_red_ << 16)); @@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m, extern void VP8LDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) { - VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; - VP8LConvertBGRAToBGR = ConvertBGRAToBGR; - VP8LConvertBGRAToRGB = ConvertBGRAToRGB; - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; - VP8LTransformColorInverse = TransformColorInverse; + VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA; + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA; + + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA; + VP8LTransformColorInverse = TransformColorInverse_MSA; } #else // !WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c index 1145d5f..76a1b6f 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c @@ -11,14 +11,14 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <arm_neon.h> -#include "./lossless.h" -#include "./neon.h" +#include "src/dsp/lossless.h" +#include "src/dsp/neon.h" //------------------------------------------------------------------------------ // Colorspace conversion functions @@ -26,8 +26,8 @@ #if !defined(WORK_AROUND_GCC) // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for // gcc-4.8.x at least. -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src, VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs } -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src, VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs } -static void ConvertBGRAToRGB(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~15); for (; src < end; src += 16) { const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); @@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src, static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 }; -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~1); const uint8x8_t shuffle = vld1_u8(kRGBAShuffle); for (; src < end; src += 2) { @@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = { { 21, 22, 24, 25, 26, 28, 29, 30 } }; -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); @@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = { { 21, 20, 26, 25, 24, 30, 29, 28 } }; -static void ConvertBGRAToRGB(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB_NEON(const uint32_t* src, + int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]); @@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src, #endif // !WORK_AROUND_GCC - //------------------------------------------------------------------------------ // Predictor Transform @@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255 }; -static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, - const uint8x16_t shuffle) { +static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb, + const uint8x16_t shuffle) { return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)), vtbl1q_u8(argb, vget_high_u8(shuffle))); } @@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, // 255 = byte will be zeroed static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; -static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, - const uint8x8_t shuffle) { +static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb, + const uint8x8_t shuffle) { return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); } #endif // USE_VTBLQ -static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, - uint32_t* dst) { +static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels, + uint32_t* dst) { const uint32_t* const end = src + (num_pixels & ~3); #ifdef USE_VTBLQ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); @@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, #endif for (; src < end; src += 4, dst += 4) { const uint8x16_t argb = vld1q_u8((const uint8_t*)src); - const uint8x16_t greens = DoGreenShuffle(argb, shuffle); + const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle); vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C @@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, //------------------------------------------------------------------------------ // Color Transform -static void TransformColorInverse(const VP8LMultipliers* const m, - const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void TransformColorInverse_NEON(const VP8LMultipliers* const m, + const uint32_t* const src, + int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 6. #define CST(X) (((int16_t)(m->X << 8)) >> 6) const int16_t rb[8] = { @@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m, const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i)); const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); // 0 g 0 g - const uint8x16_t greens = DoGreenShuffle(in, shuffle); + const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle); // x dr x db1 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); // x r' x b' @@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { VP8LPredictorsAdd[12] = PredictorAdd12_NEON; VP8LPredictorsAdd[13] = PredictorAdd13_NEON; - VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; - VP8LConvertBGRAToBGR = ConvertBGRAToBGR; - VP8LConvertBGRAToRGB = ConvertBGRAToRGB; + VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON; + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON; - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; - VP8LTransformColorInverse = TransformColorInverse; + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON; + VP8LTransformColorInverse = TransformColorInverse_NEON; } #else // !WEBP_USE_NEON diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c index 15aae93..17d7576 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c @@ -11,21 +11,22 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) -#include "./common_sse2.h" -#include "./lossless.h" -#include "./lossless_common.h" +#include "src/dsp/common_sse2.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" #include <assert.h> #include <emmintrin.h> //------------------------------------------------------------------------------ // Predictor Transform -static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, - uint32_t c2) { +static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0, + uint32_t c1, + uint32_t c2) { const __m128i zero = _mm_setzero_si128(); const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); @@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, return output; } -static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, - uint32_t c2) { +static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, + uint32_t c1, + uint32_t c2) { const __m128i zero = _mm_setzero_si128(); const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); @@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, return output; } -static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { +static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) { int pa_minus_pb; const __m128i zero = _mm_setzero_si128(); const __m128i A0 = _mm_cvtsi32_si128(a); @@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0, *avg = _mm_sub_epi8(avg1, one); } -static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1, - __m128i* const avg) { +static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, + const uint32_t a1, + __m128i* const avg) { // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) const __m128i ones = _mm_set1_epi8(1); const __m128i A0 = _mm_cvtsi32_si128(a0); @@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1, *avg = _mm_sub_epi8(avg1, one); } -static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) { +static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { const __m128i zero = _mm_setzero_si128(); const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); @@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) { return _mm_srli_epi16(sum, 1); } -static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { +static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) { __m128i output; - Average2_uint32(a0, a1, &output); + Average2_uint32_SSE2(a0, a1, &output); return _mm_cvtsi128_si32(output); } -static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { +static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1, + uint32_t a2) { const __m128i zero = _mm_setzero_si128(); - const __m128i avg1 = Average2_uint32_16(a0, a2); + const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2); const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); const __m128i sum = _mm_add_epi16(avg1, A1); const __m128i avg2 = _mm_srli_epi16(sum, 1); @@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { return output; } -static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, - uint32_t a2, uint32_t a3) { - const __m128i avg1 = Average2_uint32_16(a0, a1); - const __m128i avg2 = Average2_uint32_16(a2, a3); +static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, + uint32_t a2, uint32_t a3) { + const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1); + const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3); const __m128i sum = _mm_add_epi16(avg2, avg1); const __m128i avg3 = _mm_srli_epi16(sum, 1); const __m128i A0 = _mm_packus_epi16(avg3, avg3); @@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, } static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average3(left, top[0], top[1]); + const uint32_t pred = Average3_SSE2(left, top[0], top[1]); return pred; } static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[-1]); + const uint32_t pred = Average2_SSE2(left, top[-1]); return pred; } static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[0]); + const uint32_t pred = Average2_SSE2(left, top[0]); return pred; } static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[-1], top[0]); + const uint32_t pred = Average2_SSE2(top[-1], top[0]); (void)left; return pred; } static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[0], top[1]); + const uint32_t pred = Average2_SSE2(top[0], top[1]); (void)left; return pred; } static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average4(left, top[-1], top[0], top[1]); + const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]); return pred; } static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Select(top[0], left, top[-1]); + const uint32_t pred = Select_SSE2(top[0], left, top[-1]); return pred; } static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); + const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]); return pred; } static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); + const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]); return pred; } @@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) #undef GENERATE_PREDICTOR_2 // Predictor10: average of (average of (L,TL), average of (T, TR)). +#define DO_PRED10(OUT) do { \ + __m128i avgLTL, avg; \ + Average2_m128i(&L, &TL, &avgLTL); \ + Average2_m128i(&avgTTR, &avgLTL, &avg); \ + L = _mm_add_epi8(avg, src); \ + out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +} while (0) + +#define DO_PRED10_SHIFT do { \ + /* Rotate the pre-computed values for the next iteration.*/ \ + avgTTR = _mm_srli_si128(avgTTR, 4); \ + TL = _mm_srli_si128(TL, 4); \ + src = _mm_srli_si128(src, 4); \ +} while (0) + static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { - int i, j; + int i; __m128i L = _mm_cvtsi32_si128(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); @@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); __m128i avgTTR; Average2_m128i(&T, &TR, &avgTTR); - for (j = 0; j < 4; ++j) { - __m128i avgLTL, avg; - Average2_m128i(&L, &TL, &avgLTL); - Average2_m128i(&avgTTR, &avgLTL, &avg); - L = _mm_add_epi8(avg, src); - out[i + j] = _mm_cvtsi128_si32(L); - // Rotate the pre-computed values for the next iteration. - avgTTR = _mm_srli_si128(avgTTR, 4); - TL = _mm_srli_si128(TL, 4); - src = _mm_srli_si128(src, 4); - } + DO_PRED10(0); + DO_PRED10_SHIFT; + DO_PRED10(1); + DO_PRED10_SHIFT; + DO_PRED10(2); + DO_PRED10_SHIFT; + DO_PRED10(3); } if (i != num_pixels) { VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); } } +#undef DO_PRED10 +#undef DO_PRED10_SHIFT // Predictor11: select. -static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B, - __m128i* const out) { - // We can unpack with any value on the upper 32 bits, provided it's the same - // on both operands (to that their sum of abs diff is zero). Here we use *A. - const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); - const __m128i B_lo = _mm_unpacklo_epi32(*B, *A); - const __m128i A_hi = _mm_unpackhi_epi32(*A, *A); - const __m128i B_hi = _mm_unpackhi_epi32(*B, *A); - const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo); - const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi); - *out = _mm_packs_epi32(s_lo, s_hi); -} +#define DO_PRED11(OUT) do { \ + const __m128i L_lo = _mm_unpacklo_epi32(L, T); \ + const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \ + const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \ + const __m128i mask = _mm_cmpgt_epi32(pb, pa); \ + const __m128i A = _mm_and_si128(mask, L); \ + const __m128i B = _mm_andnot_si128(mask, T); \ + const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \ + L = _mm_add_epi8(src, pred); \ + out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +} while (0) + +#define DO_PRED11_SHIFT do { \ + /* Shift the pre-computed value for the next iteration.*/ \ + T = _mm_srli_si128(T, 4); \ + TL = _mm_srli_si128(TL, 4); \ + src = _mm_srli_si128(src, 4); \ + pa = _mm_srli_si128(pa, 4); \ +} while (0) static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { - int i, j; + int i; + __m128i pa; __m128i L = _mm_cvtsi32_si128(out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); - __m128i pa; - GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL| - for (j = 0; j < 4; ++j) { - const __m128i L_lo = _mm_unpacklo_epi32(L, L); - const __m128i TL_lo = _mm_unpacklo_epi32(TL, L); - const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL| - const __m128i mask = _mm_cmpgt_epi32(pb, pa); - const __m128i A = _mm_and_si128(mask, L); - const __m128i B = _mm_andnot_si128(mask, T); - const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T - L = _mm_add_epi8(src, pred); - out[i + j] = _mm_cvtsi128_si32(L); - // Shift the pre-computed value for the next iteration. - T = _mm_srli_si128(T, 4); - TL = _mm_srli_si128(TL, 4); - src = _mm_srli_si128(src, 4); - pa = _mm_srli_si128(pa, 4); + { + // We can unpack with any value on the upper 32 bits, provided it's the + // same on both operands (so that their sum of abs diff is zero). Here we + // use T. + const __m128i T_lo = _mm_unpacklo_epi32(T, T); + const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); + const __m128i T_hi = _mm_unpackhi_epi32(T, T); + const __m128i TL_hi = _mm_unpackhi_epi32(TL, T); + const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo); + const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi); + pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL| } + DO_PRED11(0); + DO_PRED11_SHIFT; + DO_PRED11(1); + DO_PRED11_SHIFT; + DO_PRED11(2); + DO_PRED11_SHIFT; + DO_PRED11(3); } if (i != num_pixels) { VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); } } +#undef DO_PRED11 +#undef DO_PRED11_SHIFT // Predictor12: ClampedAddSubtractFull. -#define DO_PRED12(DIFF, LANE, OUT) \ -do { \ - const __m128i all = _mm_add_epi16(L, (DIFF)); \ - const __m128i alls = _mm_packus_epi16(all, all); \ - const __m128i res = _mm_add_epi8(src, alls); \ - out[i + (OUT)] = _mm_cvtsi128_si32(res); \ - L = _mm_unpacklo_epi8(res, zero); \ +#define DO_PRED12(DIFF, LANE, OUT) do { \ + const __m128i all = _mm_add_epi16(L, (DIFF)); \ + const __m128i alls = _mm_packus_epi16(all, all); \ + const __m128i res = _mm_add_epi8(src, alls); \ + out[i + (OUT)] = _mm_cvtsi128_si32(res); \ + L = _mm_unpacklo_epi8(res, zero); \ +} while (0) + +#define DO_PRED12_SHIFT(DIFF, LANE) do { \ /* Shift the pre-computed value for the next iteration.*/ \ - if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \ + if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \ src = _mm_srli_si128(src, 4); \ } while (0) @@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); DO_PRED12(diff_lo, 0, 0); + DO_PRED12_SHIFT(diff_lo, 0); DO_PRED12(diff_lo, 1, 1); + DO_PRED12_SHIFT(diff_lo, 1); DO_PRED12(diff_hi, 0, 2); + DO_PRED12_SHIFT(diff_hi, 0); DO_PRED12(diff_hi, 1, 3); } if (i != num_pixels) { @@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, } } #undef DO_PRED12 +#undef DO_PRED12_SHIFT // Due to averages with integers, values cannot be accumulated in parallel for // predictors 13. @@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2) //------------------------------------------------------------------------------ // Subtract-Green Transform -static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels, + uint32_t* dst) { int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb @@ -414,19 +448,16 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels, //------------------------------------------------------------------------------ // Color Transform -static void TransformColorInverse(const VP8LMultipliers* const m, - const uint32_t* const src, int num_pixels, - uint32_t* dst) { +static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, + const uint32_t* const src, + int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend - const __m128i mults_rb = _mm_set_epi16( - CST(green_to_red_), CST(green_to_blue_), - CST(green_to_red_), CST(green_to_blue_), - CST(green_to_red_), CST(green_to_blue_), - CST(green_to_red_), CST(green_to_blue_)); - const __m128i mults_b2 = _mm_set_epi16( - CST(red_to_blue_), 0, CST(red_to_blue_), 0, - CST(red_to_blue_), 0, CST(red_to_blue_), 0); +#define MK_CST_16(HI, LO) \ + _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) + const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_)); + const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0); +#undef MK_CST_16 #undef CST const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks int i; @@ -454,8 +485,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m, //------------------------------------------------------------------------------ // Color-space conversion functions -static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, - uint8_t* dst) { +static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, + uint8_t* dst) { const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; @@ -469,11 +500,11 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, __m128i in5 = _mm_loadu_si128(in + 5); __m128i in6 = _mm_loadu_si128(in + 6); __m128i in7 = _mm_loadu_si128(in + 7); - VP8L32bToPlanar(&in0, &in1, &in2, &in3); - VP8L32bToPlanar(&in4, &in5, &in6, &in7); + VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3); + VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7); // At this points, in1/in5 contains red only, in2/in6 green only ... // Pack the colors in 24b RGB. - VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7); + VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7); _mm_storeu_si128(out + 0, in1); _mm_storeu_si128(out + 1, in5); _mm_storeu_si128(out + 2, in2); @@ -490,27 +521,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels, } } -static void ConvertBGRAToRGBA(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { + const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { - const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 - const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 - const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... - const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... - const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... - const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... - const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 - const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 - const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 - const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 - const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7 - const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7 - const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1... - const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5... - _mm_storeu_si128(out++, rgba0); - _mm_storeu_si128(out++, rgba4); + const __m128i A1 = _mm_loadu_si128(in++); + const __m128i A2 = _mm_loadu_si128(in++); + const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0 + const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0 + const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A + const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A + const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128i F1 = _mm_or_si128(E1, C1); + const __m128i F2 = _mm_or_si128(E2, C2); + _mm_storeu_si128(out++, F1); + _mm_storeu_si128(out++, F2); num_pixels -= 8; } // left-overs @@ -519,8 +549,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src, } } -static void ConvertBGRAToRGBA4444(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { const __m128i mask_0x0f = _mm_set1_epi8(0x0f); const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); const __m128i* in = (const __m128i*)src; @@ -541,7 +571,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7- const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7 const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0 -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7 #else const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7 @@ -555,8 +585,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src, } } -static void ConvertBGRAToRGB565(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); @@ -582,7 +612,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src, const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx const __m128i b1 = _mm_srli_epi16(b0, 3); const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7 #else const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7 @@ -596,8 +626,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src, } } -static void ConvertBGRAToBGR(const uint32_t* src, - int num_pixels, uint8_t* dst) { +static void ConvertBGRAToBGR_SSE2(const uint32_t* src, + int num_pixels, uint8_t* dst) { const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff); const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0); const __m128i* in = (const __m128i*)src; @@ -660,14 +690,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) { VP8LPredictorsAdd[12] = PredictorAdd12_SSE2; VP8LPredictorsAdd[13] = PredictorAdd13_SSE2; - VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; - VP8LTransformColorInverse = TransformColorInverse; + VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2; + VP8LTransformColorInverse = TransformColorInverse_SSE2; - VP8LConvertBGRAToRGB = ConvertBGRAToRGB; - VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; - VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444; - VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565; - VP8LConvertBGRAToBGR = ConvertBGRAToBGR; + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2; + VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2; + VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2; + VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h index d0e5f45..de026a1 100644 --- a/src/3rdparty/libwebp/src/dsp/msa_macro.h +++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h @@ -22,6 +22,7 @@ #endif #ifdef CLANG_BUILD + #define ALPHAVAL (-1) #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) #define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b) #define SRAI_B(a, b) __msa_srai_b((v16i8)a, b) @@ -32,6 +33,7 @@ #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b) #define ORI_B(a, b) __msa_ori_b((v16u8)a, b) #else + #define ALPHAVAL (0xff) #define ADDVI_H(a, b) (a + b) #define ADDVI_W(a, b) (a + b) #define SRAI_B(a, b) (a >> b) @@ -1387,4 +1389,4 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { } while (0) #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) -#endif /* WEBP_DSP_MSA_MACRO_H_ */ +#endif // WEBP_DSP_MSA_MACRO_H_ diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h index 3b548a6..aa1dea1 100644 --- a/src/3rdparty/libwebp/src/dsp/neon.h +++ b/src/3rdparty/libwebp/src/dsp/neon.h @@ -14,11 +14,12 @@ #include <arm_neon.h> -#include "./dsp.h" +#include "src/dsp/dsp.h" // Right now, some intrinsics functions seem slower, so we disable them -// everywhere except aarch64 where the inline assembly is incompatible. -#if defined(__aarch64__) +// everywhere except newer clang/gcc or aarch64 where the inline assembly is +// incompatible. +#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) #define WEBP_USE_INTRINSICS // use intrinsics when possible #endif @@ -43,11 +44,11 @@ // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) -#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) +#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) #define WORK_AROUND_GCC #endif -static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) { +static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) { uint64x2x2_t row01, row23; row01.val[0] = vreinterpretq_u64_s32(rows.val[0]); diff --git a/src/3rdparty/libwebp/src/dsp/quant.h b/src/3rdparty/libwebp/src/dsp/quant.h new file mode 100644 index 0000000..cfded0a --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/quant.h @@ -0,0 +1,86 @@ +// Copyright 2018 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- + +#ifndef WEBP_DSP_QUANT_H_ +#define WEBP_DSP_QUANT_H_ + +#include <string.h> + +#include "src/dsp/dsp.h" +#include "src/webp/types.h" + +#if defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) && \ + !defined(WEBP_HAVE_NEON_RTCD) +#include <arm_neon.h> + +#define IsFlat IsFlat_NEON + +static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { + const uint64x2_t b = vpaddlq_u32(a); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); +} + +static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, + int thresh) { + const int16x8_t tst_ones = vdupq_n_s16(-1); + uint32x4_t sum = vdupq_n_u32(0); + + int i; // ### Patched for Qt 5.9, in order to build on QNX + for (i = 0; i < num_blocks; ++i) { + // Set DC to zero. + const int16x8_t a_0 = vsetq_lane_s16(0, vld1q_s16(levels), 0); + const int16x8_t a_1 = vld1q_s16(levels + 8); + + const uint16x8_t b_0 = vshrq_n_u16(vtstq_s16(a_0, tst_ones), 15); + const uint16x8_t b_1 = vshrq_n_u16(vtstq_s16(a_1, tst_ones), 15); + + sum = vpadalq_u16(sum, b_0); + sum = vpadalq_u16(sum, b_1); + + levels += 16; + } + return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0); +} + +#else + +#define IsFlat IsFlat_C + +static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, + int thresh) { + int score = 0; + while (num_blocks-- > 0) { // TODO(skal): refine positional scoring? + int i; + for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC + score += (levels[i] != 0); + if (score > thresh) return 0; + } + levels += 16; + } + return 1; +} + +#endif // defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) && + // !defined(WEBP_HAVE_NEON_RTCD) + +static WEBP_INLINE int IsFlatSource16(const uint8_t* src) { + const uint32_t v = src[0] * 0x01010101u; + int i; + for (i = 0; i < 16; ++i) { + if (memcmp(src + 0, &v, 4) || memcmp(src + 4, &v, 4) || + memcmp(src + 8, &v, 4) || memcmp(src + 12, &v, 4)) { + return 0; + } + src += BPS; + } + return 1; +} + +#endif // WEBP_DSP_QUANT_H_ diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c index 0f54502..c5a01e8 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler.c @@ -13,19 +13,21 @@ #include <assert.h> -#include "./dsp.h" -#include "../utils/rescaler_utils.h" +#include "src/dsp/dsp.h" +#include "src/utils/rescaler_utils.h" //------------------------------------------------------------------------------ // Implementations of critical functions ImportRow / ExportRow #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) //------------------------------------------------------------------------------ // Row import -void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) { +void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk, + const uint8_t* src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int channel; @@ -56,7 +58,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) { } } -void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) { +void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk, + const uint8_t* src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; int channel; @@ -92,7 +95,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) { //------------------------------------------------------------------------------ // Row export -void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) { +void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -106,8 +109,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) { for (x_out = 0; x_out < x_out_max; ++x_out) { const uint32_t J = frow[x_out]; const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } else { const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); @@ -117,13 +119,12 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) { + (uint64_t)B * irow[x_out]; const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } } -void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) { +void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -135,22 +136,21 @@ void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) { assert(!wrk->y_expand); if (yscale) { for (x_out = 0; x_out < x_out_max; ++x_out) { - const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); + const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale); const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = frac; // new fractional start } } else { for (x_out = 0; x_out < x_out_max; ++x_out) { const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = 0; } } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER @@ -202,16 +202,15 @@ extern void WebPRescalerDspInitMIPSdspR2(void); extern void WebPRescalerDspInitMSA(void); extern void WebPRescalerDspInitNEON(void); -static volatile VP8CPUInfo rescaler_last_cpuinfo_used = - (VP8CPUInfo)&rescaler_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { - if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return; +WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) { +#if !defined(WEBP_REDUCE_SIZE) +#if !WEBP_NEON_OMIT_C_CODE + WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C; + WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C; +#endif - WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC; - WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC; - WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC; - WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC; + WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C; + WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C; if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -219,11 +218,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { WebPRescalerDspInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - WebPRescalerDspInitNEON(); - } -#endif #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { WebPRescalerDspInitMIPS32(); @@ -240,5 +234,17 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) { } #endif } - rescaler_last_cpuinfo_used = VP8GetCPUInfo; + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPRescalerDspInitNEON(); + } +#endif + + assert(WebPRescalerExportRowExpand != NULL); + assert(WebPRescalerExportRowShrink != NULL); + assert(WebPRescalerImportRowExpand != NULL); + assert(WebPRescalerImportRowShrink != NULL); +#endif // WEBP_REDUCE_SIZE } diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c index e09ad5d..61f63c6 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c @@ -11,17 +11,18 @@ // // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" -#if defined(WEBP_USE_MIPS32) +#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE) #include <assert.h> -#include "../utils/rescaler_utils.h" +#include "src/utils/rescaler_utils.h" //------------------------------------------------------------------------------ // Row import -static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) { +static void ImportRowShrink_MIPS32(WebPRescaler* const wrk, + const uint8_t* src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; const int fx_scale = wrk->fx_scale; @@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) { } } -static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) { +static void ImportRowExpand_MIPS32(WebPRescaler* const wrk, + const uint8_t* src) { const int x_stride = wrk->num_channels; const int x_out_max = wrk->dst_width * wrk->num_channels; const int x_add = wrk->x_add; @@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) { //------------------------------------------------------------------------------ // Row export -static void ExportRowExpand(WebPRescaler* const wrk) { +static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) { uint8_t* dst = wrk->dst; rescaler_t* irow = wrk->irow; const int x_out_max = wrk->dst_width * wrk->num_channels; @@ -207,7 +209,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) { } } -static void ExportRowShrink(WebPRescaler* const wrk) { +#if 0 // disabled for now. TODO(skal): make match the C-code +static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) { const int x_out_max = wrk->dst_width * wrk->num_channels; uint8_t* dst = wrk->dst; rescaler_t* irow = wrk->irow; @@ -271,6 +274,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) { ); } } +#endif // 0 //------------------------------------------------------------------------------ // Entry point @@ -278,10 +282,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) { extern void WebPRescalerDspInitMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) { - WebPRescalerImportRowExpand = ImportRowExpand; - WebPRescalerImportRowShrink = ImportRowShrink; - WebPRescalerExportRowExpand = ExportRowExpand; - WebPRescalerExportRowShrink = ExportRowShrink; + WebPRescalerImportRowExpand = ImportRowExpand_MIPS32; + WebPRescalerImportRowShrink = ImportRowShrink_MIPS32; + WebPRescalerExportRowExpand = ExportRowExpand_MIPS32; +// WebPRescalerExportRowShrink = ExportRowShrink_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c index 2308d64..419b741 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c @@ -11,20 +11,22 @@ // // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" -#if defined(WEBP_USE_MIPS_DSP_R2) +#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE) #include <assert.h> -#include "../utils/rescaler_utils.h" +#include "src/utils/rescaler_utils.h" #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) //------------------------------------------------------------------------------ // Row export -static void ExportRowShrink(WebPRescaler* const wrk) { +#if 0 // disabled for now. TODO(skal): make match the C-code +static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { int i; const int x_out_max = wrk->dst_width * wrk->num_channels; uint8_t* dst = wrk->dst; @@ -105,10 +107,9 @@ static void ExportRowShrink(WebPRescaler* const wrk) { ); } for (i = 0; i < (x_out_max & 0x3); ++i) { - const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale); + const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(*frow++, yscale); const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale); - assert(v >= 0 && v <= 255); - *dst++ = v; + *dst++ = (v > 255) ? 255u : (uint8_t)v; *irow++ = frac; // new fractional start } } else { @@ -154,15 +155,15 @@ static void ExportRowShrink(WebPRescaler* const wrk) { ); } for (i = 0; i < (x_out_max & 0x3); ++i) { - const int v = (int)MULT_FIX(*irow, wrk->fxy_scale); - assert(v >= 0 && v <= 255); - *dst++ = v; + const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale); + *dst++ = (v > 255) ? 255u : (uint8_t)v; *irow++ = 0; } } } +#endif // 0 -static void ExportRowExpand(WebPRescaler* const wrk) { +static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { int i; uint8_t* dst = wrk->dst; rescaler_t* irow = wrk->irow; @@ -216,8 +217,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) { for (i = 0; i < (x_out_max & 0x3); ++i) { const uint32_t J = *frow++; const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - *dst++ = v; + *dst++ = (v > 255) ? 255u : (uint8_t)v; } } else { const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); @@ -288,12 +288,12 @@ static void ExportRowExpand(WebPRescaler* const wrk) { + (uint64_t)B * *irow++; const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - *dst++ = v; + *dst++ = (v > 255) ? 255u : (uint8_t)v; } } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER @@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) { extern void WebPRescalerDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) { - WebPRescalerExportRowExpand = ExportRowExpand; - WebPRescalerExportRowShrink = ExportRowShrink; + WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2; +// WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c index 2c10e55..256dbdd 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c @@ -11,17 +11,18 @@ // // Author: Prashant Patil (prashant.patil@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" -#if defined(WEBP_USE_MSA) +#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE) #include <assert.h> -#include "../utils/rescaler_utils.h" -#include "./msa_macro.h" +#include "src/utils/rescaler_utils.h" +#include "src/dsp/msa_macro.h" #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \ v4u32 tmp0, tmp1, tmp2, tmp3; \ @@ -165,8 +166,7 @@ static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst, for (x_out = 0; x_out < length; ++x_out) { const uint32_t J = frow[x_out]; const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } } @@ -240,13 +240,12 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow, + (uint64_t)B * irow[x_out]; const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } } -static void RescalerExportRowExpand(WebPRescaler* const wrk) { +static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { uint8_t* dst = wrk->dst; rescaler_t* irow = wrk->irow; const int x_out_max = wrk->dst_width * wrk->num_channels; @@ -262,6 +261,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) { } } +#if 0 // disabled for now. TODO(skal): make match the C-code static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, uint8_t* dst, int length, const uint32_t yscale, @@ -340,10 +340,9 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, length -= 4; } for (x_out = 0; x_out < length; ++x_out) { - const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); + const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale); const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = frac; } } @@ -404,14 +403,13 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst, } for (x_out = 0; x_out < length; ++x_out) { const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = 0; } } } -static void RescalerExportRowShrink(WebPRescaler* const wrk) { +static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { uint8_t* dst = wrk->dst; rescaler_t* irow = wrk->irow; const int x_out_max = wrk->dst_width * wrk->num_channels; @@ -426,6 +424,7 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) { ExportRowShrink_1(irow, dst, x_out_max, wrk); } } +#endif // 0 //------------------------------------------------------------------------------ // Entry point @@ -433,8 +432,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) { extern void WebPRescalerDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { - WebPRescalerExportRowExpand = RescalerExportRowExpand; - WebPRescalerExportRowShrink = RescalerExportRowShrink; + WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2; +// WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; } #else // !WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c index b2dd8f3..b976a85 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c @@ -11,17 +11,18 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" -#if defined(WEBP_USE_NEON) +#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE) #include <arm_neon.h> #include <assert.h> -#include "./neon.h" -#include "../utils/rescaler_utils.h" +#include "src/dsp/neon.h" +#include "src/utils/rescaler_utils.h" #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) #define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC)) #define LOAD_32x8(SRC, DST0, DST1) \ @@ -35,15 +36,18 @@ #if (WEBP_RESCALER_RFIX == 32) #define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1)) -#define MULT_FIX(A, B) /* note: B is actualy scale>>1. See MAKE_HALF_CST */ \ +// note: B is actualy scale>>1. See MAKE_HALF_CST +#define MULT_FIX(A, B) \ vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B))) +#define MULT_FIX_FLOOR(A, B) \ + vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32((A)), (B))) #else #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work" #endif -static uint32x4_t Interpolate(const rescaler_t* const frow, - const rescaler_t* const irow, - uint32_t A, uint32_t B) { +static uint32x4_t Interpolate_NEON(const rescaler_t* const frow, + const rescaler_t* const irow, + uint32_t A, uint32_t B) { LOAD_32x4(frow, A0); LOAD_32x4(irow, B0); const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A); @@ -56,7 +60,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow, return E; } -static void RescalerExportRowExpand(WebPRescaler* const wrk) { +static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -77,28 +81,27 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) { const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half); const uint16x4_t C0 = vmovn_u32(B0); const uint16x4_t C1 = vmovn_u32(B1); - const uint8x8_t D = vmovn_u16(vcombine_u16(C0, C1)); + const uint8x8_t D = vqmovn_u16(vcombine_u16(C0, C1)); vst1_u8(dst + x_out, D); } for (; x_out < x_out_max; ++x_out) { const uint32_t J = frow[x_out]; const int v = (int)MULT_FIX_C(J, fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } else { const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); for (x_out = 0; x_out < max_span; x_out += 8) { const uint32x4_t C0 = - Interpolate(frow + x_out + 0, irow + x_out + 0, A, B); + Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B); const uint32x4_t C1 = - Interpolate(frow + x_out + 4, irow + x_out + 4, A, B); + Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B); const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half); const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half); const uint16x4_t E0 = vmovn_u32(D0); const uint16x4_t E1 = vmovn_u32(D1); - const uint8x8_t F = vmovn_u16(vcombine_u16(E0, E1)); + const uint8x8_t F = vqmovn_u16(vcombine_u16(E0, E1)); vst1_u8(dst + x_out, F); } for (; x_out < x_out_max; ++x_out) { @@ -106,13 +109,12 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) { + (uint64_t)B * irow[x_out]; const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); const int v = (int)MULT_FIX_C(J, fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } } -static void RescalerExportRowShrink(WebPRescaler* const wrk) { +static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -131,23 +133,22 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) { for (x_out = 0; x_out < max_span; x_out += 8) { LOAD_32x8(frow + x_out, in0, in1); LOAD_32x8(irow + x_out, in2, in3); - const uint32x4_t A0 = MULT_FIX(in0, yscale_half); - const uint32x4_t A1 = MULT_FIX(in1, yscale_half); + const uint32x4_t A0 = MULT_FIX_FLOOR(in0, yscale_half); + const uint32x4_t A1 = MULT_FIX_FLOOR(in1, yscale_half); const uint32x4_t B0 = vqsubq_u32(in2, A0); const uint32x4_t B1 = vqsubq_u32(in3, A1); const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half); const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half); const uint16x4_t D0 = vmovn_u32(C0); const uint16x4_t D1 = vmovn_u32(C1); - const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1)); + const uint8x8_t E = vqmovn_u16(vcombine_u16(D0, D1)); vst1_u8(dst + x_out, E); STORE_32x8(A0, A1, irow + x_out); } for (; x_out < x_out_max; ++x_out) { - const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale); - const int v = (int)MULT_FIX_C(irow[x_out] - frac, wrk->fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale); + const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale); + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = frac; // new fractional start } } else { @@ -157,26 +158,31 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) { const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half); const uint16x4_t B0 = vmovn_u32(A0); const uint16x4_t B1 = vmovn_u32(A1); - const uint8x8_t C = vmovn_u16(vcombine_u16(B0, B1)); + const uint8x8_t C = vqmovn_u16(vcombine_u16(B0, B1)); vst1_u8(dst + x_out, C); STORE_32x8(zero, zero, irow + x_out); } for (; x_out < x_out_max; ++x_out) { const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = 0; } } } +#undef MULT_FIX_FLOOR_C +#undef MULT_FIX_C +#undef MULT_FIX_FLOOR +#undef MULT_FIX +#undef ROUNDER + //------------------------------------------------------------------------------ extern void WebPRescalerDspInitNEON(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) { - WebPRescalerExportRowExpand = RescalerExportRowExpand; - WebPRescalerExportRowShrink = RescalerExportRowShrink; + WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON; + WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON; } #else // !WEBP_USE_NEON diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c index 8271c22..d7effea 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c @@ -11,23 +11,24 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE) #include <emmintrin.h> #include <assert.h> -#include "../utils/rescaler_utils.h" -#include "../utils/utils.h" +#include "src/utils/rescaler_utils.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // Implementations of critical functions ImportRow / ExportRow #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0 -static void LoadTwoPixels(const uint8_t* const src, __m128i* out) { +static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) { const __m128i zero = _mm_setzero_si128(); const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0 @@ -36,28 +37,30 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) { } // input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0 -static void LoadHeightPixels(const uint8_t* const src, __m128i* out) { +static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) { const __m128i zero = _mm_setzero_si128(); const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH *out = _mm_unpacklo_epi8(A, zero); } -static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, - const uint8_t* src) { +static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, + const uint8_t* src) { rescaler_t* frow = wrk->frow; const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels; const int x_add = wrk->x_add; int accum = x_add; __m128i cur_pixels; + // SSE2 implementation only works with 16b signed arithmetic at max. + if (wrk->src_width < 8 || accum >= (1 << 15)) { + WebPRescalerImportRowExpand_C(wrk, src); + return; + } + assert(!WebPRescalerInputDone(wrk)); assert(wrk->x_expand); if (wrk->num_channels == 4) { - if (wrk->src_width < 2) { - WebPRescalerImportRowExpandC(wrk, src); - return; - } - LoadTwoPixels(src, &cur_pixels); + LoadTwoPixels_SSE2(src, &cur_pixels); src += 4; while (1) { const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum); @@ -67,7 +70,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, if (frow >= frow_end) break; accum -= wrk->x_sub; if (accum < 0) { - LoadTwoPixels(src, &cur_pixels); + LoadTwoPixels_SSE2(src, &cur_pixels); src += 4; accum += x_add; } @@ -75,11 +78,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, } else { int left; const uint8_t* const src_limit = src + wrk->src_width - 8; - if (wrk->src_width < 8) { - WebPRescalerImportRowExpandC(wrk, src); - return; - } - LoadHeightPixels(src, &cur_pixels); + LoadEightPixels_SSE2(src, &cur_pixels); src += 7; left = 7; while (1) { @@ -94,7 +93,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, if (--left) { cur_pixels = _mm_srli_si128(cur_pixels, 2); } else if (src <= src_limit) { - LoadHeightPixels(src, &cur_pixels); + LoadEightPixels_SSE2(src, &cur_pixels); src += 7; left = 7; } else { // tail @@ -110,8 +109,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk, assert(accum == 0); } -static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, - const uint8_t* src) { +static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, + const uint8_t* src) { const int x_sub = wrk->x_sub; int accum = 0; const __m128i zero = _mm_setzero_si128(); @@ -123,7 +122,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width; if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) { - WebPRescalerImportRowShrinkC(wrk, src); + WebPRescalerImportRowShrink_C(wrk, src); return; } assert(!WebPRescalerInputDone(wrk)); @@ -169,12 +168,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk, // Row export // load *src as epi64, multiply by mult and store result in [out0 ... out3] -static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, - const __m128i* const mult, - __m128i* const out0, - __m128i* const out1, - __m128i* const out2, - __m128i* const out3) { +static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src, + const __m128i* const mult, + __m128i* const out0, + __m128i* const out1, + __m128i* const out2, + __m128i* const out3) { const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0)); const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4)); const __m128i A2 = _mm_srli_epi64(A0, 32); @@ -192,12 +191,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src, } } -static WEBP_INLINE void ProcessRow(const __m128i* const A0, - const __m128i* const A1, - const __m128i* const A2, - const __m128i* const A3, - const __m128i* const mult, - uint8_t* const dst) { +static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, + const __m128i* const A1, + const __m128i* const A2, + const __m128i* const A3, + const __m128i* const mult, + uint8_t* const dst) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); const __m128i B0 = _mm_mul_epu32(*A0, *mult); @@ -210,7 +209,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0, const __m128i C3 = _mm_add_epi64(B3, rounder); const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX); -#if (WEBP_RESCALER_FIX < 32) +#if (WEBP_RESCALER_RFIX < 32) const __m128i D2 = _mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask); const __m128i D3 = @@ -226,7 +225,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0, _mm_storel_epi64((__m128i*)dst, G); } -static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { +static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -240,14 +239,13 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { if (wrk->y_accum == 0) { for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; - LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3); - ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); + LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3); + ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const uint32_t J = frow[x_out]; const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } else { const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); @@ -257,8 +255,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; - LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3); - LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3); + LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3); { const __m128i C0 = _mm_add_epi64(A0, B0); const __m128i C1 = _mm_add_epi64(A1, B1); @@ -272,7 +270,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX); const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX); const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX); - ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out); + ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { @@ -280,13 +278,12 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) { + (uint64_t)B * irow[x_out]; const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); const int v = (int)MULT_FIX(J, wrk->fy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; } } } -static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { +static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; rescaler_t* const irow = wrk->irow; @@ -300,20 +297,15 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { const int scale_xy = wrk->fxy_scale; const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy); const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale); - const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3, B0, B1, B2, B3; - LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3); - LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3); + LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3); { - const __m128i C0 = _mm_add_epi64(B0, rounder); - const __m128i C1 = _mm_add_epi64(B1, rounder); - const __m128i C2 = _mm_add_epi64(B2, rounder); - const __m128i C3 = _mm_add_epi64(B3, rounder); - const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); // = frac - const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX); - const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX); - const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX); + const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX); // = frac + const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX); + const __m128i D2 = _mm_srli_epi64(B2, WEBP_RESCALER_RFIX); + const __m128i D3 = _mm_srli_epi64(B3, WEBP_RESCALER_RFIX); const __m128i E0 = _mm_sub_epi64(A0, D0); // irow[x] - frac const __m128i E1 = _mm_sub_epi64(A1, D1); const __m128i E2 = _mm_sub_epi64(A2, D2); @@ -324,14 +316,13 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { const __m128i G1 = _mm_or_si128(D1, F3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0); _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1); - ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); + ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { - const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale); + const uint32_t frac = (int)MULT_FIX_FLOOR(frow[x_out], yscale); const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = frac; // new fractional start } } else { @@ -340,20 +331,20 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { const __m128i zero = _mm_setzero_si128(); for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) { __m128i A0, A1, A2, A3; - LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3); + LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero); _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero); - ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out); + ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out); } for (; x_out < x_out_max; ++x_out) { const int v = (int)MULT_FIX(irow[x_out], scale); - assert(v >= 0 && v <= 255); - dst[x_out] = v; + dst[x_out] = (v > 255) ? 255u : (uint8_t)v; irow[x_out] = 0; } } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER @@ -362,10 +353,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) { extern void WebPRescalerDspInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) { - WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2; - WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2; - WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2; - WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2; + WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2; + WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2; + WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2; + WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/ssim.c b/src/3rdparty/libwebp/src/dsp/ssim.c new file mode 100644 index 0000000..989ce82 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/ssim.c @@ -0,0 +1,159 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// distortion calculation +// +// Author: Skal (pascal.massimino@gmail.com) + +#include <assert.h> +#include <stdlib.h> // for abs() + +#include "src/dsp/dsp.h" + +#if !defined(WEBP_REDUCE_SIZE) + +//------------------------------------------------------------------------------ +// SSIM / PSNR + +// hat-shaped filter. Sum of coefficients is equal to 16. +static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = { + 1, 2, 3, 4, 3, 2, 1 +}; +static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2 + +static WEBP_INLINE double SSIMCalculation( + const VP8DistoStats* const stats, uint32_t N /*num samples*/) { + const uint32_t w2 = N * N; + const uint32_t C1 = 20 * w2; + const uint32_t C2 = 60 * w2; + const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6 + const uint64_t xmxm = (uint64_t)stats->xm * stats->xm; + const uint64_t ymym = (uint64_t)stats->ym * stats->ym; + if (xmxm + ymym >= C3) { + const int64_t xmym = (int64_t)stats->xm * stats->ym; + const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative + const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm; + const uint64_t syy = (uint64_t)stats->yym * N - ymym; + // we descale by 8 to prevent overflow during the fnum/fden multiply. + const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8; + const uint64_t den_S = (sxx + syy + C2) >> 8; + const uint64_t fnum = (2 * xmym + C1) * num_S; + const uint64_t fden = (xmxm + ymym + C1) * den_S; + const double r = (double)fnum / fden; + assert(r >= 0. && r <= 1.0); + return r; + } + return 1.; // area is too dark to contribute meaningfully +} + +double VP8SSIMFromStats(const VP8DistoStats* const stats) { + return SSIMCalculation(stats, kWeightSum); +} + +double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) { + return SSIMCalculation(stats, stats->w); +} + +static double SSIMGetClipped_C(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, int W, int H) { + VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; + const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; + const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 + : yo + VP8_SSIM_KERNEL; + const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; + const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 + : xo + VP8_SSIM_KERNEL; + int x, y; + src1 += ymin * stride1; + src2 += ymin * stride2; + for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { + for (x = xmin; x <= xmax; ++x) { + const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo] + * kWeight[VP8_SSIM_KERNEL + y - yo]; + const uint32_t s1 = src1[x]; + const uint32_t s2 = src2[x]; + stats.w += w; + stats.xm += w * s1; + stats.ym += w * s2; + stats.xxm += w * s1 * s1; + stats.xym += w * s1 * s2; + stats.yym += w * s2 * s2; + } + } + return VP8SSIMFromStatsClipped(&stats); +} + +static double SSIMGet_C(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2) { + VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 }; + int x, y; + for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { + for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { + const uint32_t w = kWeight[x] * kWeight[y]; + const uint32_t s1 = src1[x]; + const uint32_t s2 = src2[x]; + stats.xm += w * s1; + stats.ym += w * s2; + stats.xxm += w * s1 * s1; + stats.xym += w * s1 * s2; + stats.yym += w * s2 * s2; + } + } + return VP8SSIMFromStats(&stats); +} + +#endif // !defined(WEBP_REDUCE_SIZE) + +//------------------------------------------------------------------------------ + +#if !defined(WEBP_DISABLE_STATS) +static uint32_t AccumulateSSE_C(const uint8_t* src1, + const uint8_t* src2, int len) { + int i; + uint32_t sse2 = 0; + assert(len <= 65535); // to ensure that accumulation fits within uint32_t + for (i = 0; i < len; ++i) { + const int32_t diff = src1[i] - src2[i]; + sse2 += diff * diff; + } + return sse2; +} +#endif + +//------------------------------------------------------------------------------ + +#if !defined(WEBP_REDUCE_SIZE) +VP8SSIMGetFunc VP8SSIMGet; +VP8SSIMGetClippedFunc VP8SSIMGetClipped; +#endif +#if !defined(WEBP_DISABLE_STATS) +VP8AccumulateSSEFunc VP8AccumulateSSE; +#endif + +extern void VP8SSIMDspInitSSE2(void); + +WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) { +#if !defined(WEBP_REDUCE_SIZE) + VP8SSIMGetClipped = SSIMGetClipped_C; + VP8SSIMGet = SSIMGet_C; +#endif + +#if !defined(WEBP_DISABLE_STATS) + VP8AccumulateSSE = AccumulateSSE_C; +#endif + + if (VP8GetCPUInfo != NULL) { +#if defined(WEBP_USE_SSE2) + if (VP8GetCPUInfo(kSSE2)) { + VP8SSIMDspInitSSE2(); + } +#endif + } +} diff --git a/src/3rdparty/libwebp/src/dsp/ssim_sse2.c b/src/3rdparty/libwebp/src/dsp/ssim_sse2.c new file mode 100644 index 0000000..1dcb0eb --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/ssim_sse2.c @@ -0,0 +1,165 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE2 version of distortion calculation +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "src/dsp/dsp.h" + +#if defined(WEBP_USE_SSE2) + +#include <assert.h> +#include <emmintrin.h> + +#include "src/dsp/common_sse2.h" + +#if !defined(WEBP_DISABLE_STATS) + +// Helper function +static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b, + __m128i* const sum) { + // take abs(a-b) in 8b + const __m128i a_b = _mm_subs_epu8(a, b); + const __m128i b_a = _mm_subs_epu8(b, a); + const __m128i abs_a_b = _mm_or_si128(a_b, b_a); + // zero-extend to 16b + const __m128i zero = _mm_setzero_si128(); + const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero); + const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero); + // multiply with self + const __m128i sum1 = _mm_madd_epi16(C0, C0); + const __m128i sum2 = _mm_madd_epi16(C1, C1); + *sum = _mm_add_epi32(sum1, sum2); +} + +//------------------------------------------------------------------------------ +// SSIM / PSNR entry point + +static uint32_t AccumulateSSE_SSE2(const uint8_t* src1, + const uint8_t* src2, int len) { + int i = 0; + uint32_t sse2 = 0; + if (len >= 16) { + const int limit = len - 32; + int32_t tmp[4]; + __m128i sum1; + __m128i sum = _mm_setzero_si128(); + __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]); + __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]); + i += 16; + while (i <= limit) { + const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]); + const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]); + __m128i sum2; + i += 16; + SubtractAndSquare_SSE2(a0, b0, &sum1); + sum = _mm_add_epi32(sum, sum1); + a0 = _mm_loadu_si128((const __m128i*)&src1[i]); + b0 = _mm_loadu_si128((const __m128i*)&src2[i]); + i += 16; + SubtractAndSquare_SSE2(a1, b1, &sum2); + sum = _mm_add_epi32(sum, sum2); + } + SubtractAndSquare_SSE2(a0, b0, &sum1); + sum = _mm_add_epi32(sum, sum1); + _mm_storeu_si128((__m128i*)tmp, sum); + sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]); + } + + for (; i < len; ++i) { + const int32_t diff = src1[i] - src2[i]; + sse2 += diff * diff; + } + return sse2; +} +#endif // !defined(WEBP_DISABLE_STATS) + +#if !defined(WEBP_REDUCE_SIZE) + +static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) { + uint16_t tmp[8]; + const __m128i a = _mm_srli_si128(*m, 8); + const __m128i b = _mm_add_epi16(*m, a); + _mm_storeu_si128((__m128i*)tmp, b); + return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0]; +} + +static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) { + const __m128i a = _mm_srli_si128(*m, 8); + const __m128i b = _mm_add_epi32(*m, a); + const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4)); + return (uint32_t)_mm_cvtsi128_si32(c); +} + +static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 }; + +#define ACCUMULATE_ROW(WEIGHT) do { \ + /* compute row weight (Wx * Wy) */ \ + const __m128i Wy = _mm_set1_epi16((WEIGHT)); \ + const __m128i W = _mm_mullo_epi16(Wx, Wy); \ + /* process 8 bytes at a time (7 bytes, actually) */ \ + const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \ + const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \ + /* convert to 16b and multiply by weight */ \ + const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \ + const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \ + const __m128i wa1 = _mm_mullo_epi16(a1, W); \ + const __m128i wb1 = _mm_mullo_epi16(b1, W); \ + /* accumulate */ \ + xm = _mm_add_epi16(xm, wa1); \ + ym = _mm_add_epi16(ym, wb1); \ + xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \ + xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \ + yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \ + src1 += stride1; \ + src2 += stride2; \ +} while (0) + +static double SSIMGet_SSE2(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2) { + VP8DistoStats stats; + const __m128i zero = _mm_setzero_si128(); + __m128i xm = zero, ym = zero; // 16b accums + __m128i xxm = zero, yym = zero, xym = zero; // 32b accum + const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); + assert(2 * VP8_SSIM_KERNEL + 1 == 7); + ACCUMULATE_ROW(1); + ACCUMULATE_ROW(2); + ACCUMULATE_ROW(3); + ACCUMULATE_ROW(4); + ACCUMULATE_ROW(3); + ACCUMULATE_ROW(2); + ACCUMULATE_ROW(1); + stats.xm = HorizontalAdd16b_SSE2(&xm); + stats.ym = HorizontalAdd16b_SSE2(&ym); + stats.xxm = HorizontalAdd32b_SSE2(&xxm); + stats.xym = HorizontalAdd32b_SSE2(&xym); + stats.yym = HorizontalAdd32b_SSE2(&yym); + return VP8SSIMFromStats(&stats); +} + +#endif // !defined(WEBP_REDUCE_SIZE) + +extern void VP8SSIMDspInitSSE2(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { +#if !defined(WEBP_DISABLE_STATS) + VP8AccumulateSSE = AccumulateSSE_SSE2; +#endif +#if !defined(WEBP_REDUCE_SIZE) + VP8SSIMGet = SSIMGet_SSE2; +#endif +} + +#else // !WEBP_USE_SSE2 + +WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) + +#endif // WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c index 265e722..9b60da5 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling.c @@ -11,8 +11,8 @@ // // Author: somnath@google.com (Somnath Banerjee) -#include "./dsp.h" -#include "./yuv.h" +#include "src/dsp/dsp.h" +#include "src/dsp/yuv.h" #include <assert.h> @@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \ const uint32_t uv1 = (diag_03 + t_uv) >> 1; \ FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (2 * x - 1) * XSTEP); \ + top_dst + (2 * x - 1) * (XSTEP)); \ FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \ - top_dst + (2 * x - 0) * XSTEP); \ + top_dst + (2 * x - 0) * (XSTEP)); \ } \ if (bottom_y != NULL) { \ const uint32_t uv0 = (diag_03 + l_uv) >> 1; \ const uint32_t uv1 = (diag_12 + uv) >> 1; \ FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (2 * x - 1) * XSTEP); \ + bottom_dst + (2 * x - 1) * (XSTEP)); \ FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \ - bottom_dst + (2 * x + 0) * XSTEP); \ + bottom_dst + (2 * x + 0) * (XSTEP)); \ } \ tl_uv = t_uv; \ l_uv = uv; \ @@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ { \ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (len - 1) * XSTEP); \ + top_dst + (len - 1) * (XSTEP)); \ } \ if (bottom_y != NULL) { \ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (len - 1) * XSTEP); \ + bottom_dst + (len - 1) * (XSTEP)); \ } \ } \ } // All variants implemented. -UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) -UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) -UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) -UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) -UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4) -UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2) -UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2) +#if !WEBP_NEON_OMIT_C_CODE +UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4) +UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4) +UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3) +UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3) +UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2) +UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2) +#else +static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y, + const uint8_t* top_u, const uint8_t* top_v, + const uint8_t* cur_u, const uint8_t* cur_v, + uint8_t* top_dst, uint8_t* bottom_dst, int len) { + (void)top_y; + (void)bottom_y; + (void)top_u; + (void)top_v; + (void)cur_u; + (void)cur_v; + (void)top_dst; + (void)bottom_dst; + (void)len; + assert(0); // COLORSPACE SUPPORT NOT COMPILED +} +#define UpsampleArgbLinePair_C EmptyUpsampleFunc +#define UpsampleRgbLinePair_C EmptyUpsampleFunc +#define UpsampleBgrLinePair_C EmptyUpsampleFunc +#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc +#define UpsampleRgb565LinePair_C EmptyUpsampleFunc +#endif // WEBP_REDUCE_CSP + +#endif #undef LOAD_UV #undef UPSAMPLE_FUNC @@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb) WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) { WebPInitUpsamplers(); - VP8YUVInit(); #ifdef FANCY_UPSAMPLING return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB]; #else @@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ uint8_t* dst, int len) { \ int i; \ - for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ + for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \ } -YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3) -YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3) -YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4) -YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4) -YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4) -YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2) -YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2) +YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4) +YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3) +YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3) +YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4) +YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2) +YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2) +#else +static void EmptyYuv444Func(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { + (void)y; + (void)u; + (void)v; + (void)dst; + (void)len; +} +#define WebPYuv444ToRgb_C EmptyYuv444Func +#define WebPYuv444ToBgr_C EmptyYuv444Func +#define WebPYuv444ToArgb_C EmptyYuv444Func +#define WebPYuv444ToRgba4444_C EmptyYuv444Func +#define WebPYuv444ToRgb565_C EmptyYuv444Func +#endif // WEBP_REDUCE_CSP #undef YUV444_FUNC @@ -175,24 +217,20 @@ WebPYUV444Converter WebPYUV444Converters[MODE_LAST]; extern void WebPInitYUV444ConvertersMIPSdspR2(void); extern void WebPInitYUV444ConvertersSSE2(void); - -static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 = - (VP8CPUInfo)&upsampling_last_cpuinfo_used1; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) { - if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return; - - WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC; - WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC; - WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC; - WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC; - WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC; - WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C; - WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C; - WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC; - WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC; - WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC; - WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C; +extern void WebPInitYUV444ConvertersSSE41(void); + +WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) { + WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C; + WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C; + WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C; + WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C; + WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C; + WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C; + WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C; + WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C; + WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C; + WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C; + WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C; if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -200,41 +238,43 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) { WebPInitYUV444ConvertersSSE2(); } #endif +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitYUV444ConvertersSSE41(); + } +#endif #if defined(WEBP_USE_MIPS_DSP_R2) if (VP8GetCPUInfo(kMIPSdspR2)) { WebPInitYUV444ConvertersMIPSdspR2(); } #endif } - upsampling_last_cpuinfo_used1 = VP8GetCPUInfo; } //------------------------------------------------------------------------------ // Main calls extern void WebPInitUpsamplersSSE2(void); +extern void WebPInitUpsamplersSSE41(void); extern void WebPInitUpsamplersNEON(void); extern void WebPInitUpsamplersMIPSdspR2(void); extern void WebPInitUpsamplersMSA(void); -static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 = - (VP8CPUInfo)&upsampling_last_cpuinfo_used2; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) { - if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) { #ifdef FANCY_UPSAMPLING - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; - WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; - WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; - WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; +#if !WEBP_NEON_OMIT_C_CODE + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C; + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C; +#endif // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { @@ -243,9 +283,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) { WebPInitUpsamplersSSE2(); } #endif -#if defined(WEBP_USE_NEON) - if (VP8GetCPUInfo(kNEON)) { - WebPInitUpsamplersNEON(); +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitUpsamplersSSE41(); } #endif #if defined(WEBP_USE_MIPS_DSP_R2) @@ -259,8 +299,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) { } #endif } + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPInitUpsamplersNEON(); + } +#endif + + assert(WebPUpsamplers[MODE_RGBA] != NULL); + assert(WebPUpsamplers[MODE_BGRA] != NULL); + assert(WebPUpsamplers[MODE_rgbA] != NULL); + assert(WebPUpsamplers[MODE_bgrA] != NULL); +#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE + assert(WebPUpsamplers[MODE_RGB] != NULL); + assert(WebPUpsamplers[MODE_BGR] != NULL); + assert(WebPUpsamplers[MODE_ARGB] != NULL); + assert(WebPUpsamplers[MODE_RGBA_4444] != NULL); + assert(WebPUpsamplers[MODE_RGB_565] != NULL); + assert(WebPUpsamplers[MODE_Argb] != NULL); + assert(WebPUpsamplers[MODE_rgbA_4444] != NULL); +#endif + #endif // FANCY_UPSAMPLING - upsampling_last_cpuinfo_used2 = VP8GetCPUInfo; } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c index ed2eb74..10d499d 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c @@ -12,14 +12,12 @@ // Author(s): Branimir Vasic (branimir.vasic@imgtec.com) // Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) #include <assert.h> -#include "./yuv.h" - -#if !defined(WEBP_YUV_USE_TABLE) +#include "src/dsp/yuv.h" #define YUV_TO_RGB(Y, U, V, R, G, B) do { \ const int t1 = MultHi(Y, 19077); \ @@ -48,6 +46,7 @@ ); \ } while (0) +#if !defined(WEBP_REDUCE_CSP) static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) { int r, g, b; YUV_TO_RGB(y, u, v, r, g, b); @@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) { { const int rg = (r & 0xf8) | (g >> 5); const int gb = ((g << 3) & 0xe0) | (b >> 3); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) rgb[0] = gb; rgb[1] = rg; #else @@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v, { const int rg = (r & 0xf0) | (g >> 4); const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) argb[0] = ba; argb[1] = rg; #else @@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v, #endif } } -#endif // WEBP_YUV_USE_TABLE +#endif // WEBP_REDUCE_CSP //----------------------------------------------------------------------------- // Alpha handling variants +#if !defined(WEBP_REDUCE_CSP) static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) { int r, g, b; @@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, argb[2] = g; argb[3] = b; } +#endif // WEBP_REDUCE_CSP static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) { int r, g, b; @@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ } // All variants implemented. -UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3) -UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3) UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4) UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3) +UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3) UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4) UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2) UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2) +#endif // WEBP_REDUCE_CSP #undef LOAD_UV #undef UPSAMPLE_FUNC @@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2) extern void WebPInitUpsamplersMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) { - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; +#endif // WEBP_REDUCE_CSP } #endif // FANCY_UPSAMPLING @@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ } -YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3) -YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3) YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4) YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3) +YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3) YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4) YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2) YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2) +#endif // WEBP_REDUCE_CSP #undef YUV444_FUNC @@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2) extern void WebPInitYUV444ConvertersMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) { - WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb; WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba; - WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr; WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra; + WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba; + WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra; +#if !defined(WEBP_REDUCE_CSP) + WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb; + WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr; WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb; WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444; WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565; - WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba; - WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra; WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb; WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444; +#endif // WEBP_REDUCE_CSP } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c index f24926f..99eea70 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c @@ -12,12 +12,12 @@ // Author: Prashant Patil (prashant.patil@imgtec.com) #include <string.h> -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MSA) -#include "./msa_macro.h" -#include "./yuv.h" +#include "src/dsp/msa_macro.h" +#include "src/dsp/yuv.h" #ifdef FANCY_UPSAMPLING @@ -264,6 +264,7 @@ static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) { bgr[2] = Clip8(r1 >> 6); } +#if !defined(WEBP_REDUCE_CSP) static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) { const int y1 = MultHi(y, 19077); const int r1 = y1 + MultHi(v, 26149) - 14234; @@ -274,7 +275,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) { const int b = Clip8(b1 >> 6); const int rg = (r & 0xf8) | (g >> 5); const int gb = ((g << 3) & 0xe0) | (b >> 3); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) rgb[0] = gb; rgb[1] = rg; #else @@ -293,7 +294,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) { const int b = Clip8(b1 >> 6); const int rg = (r & 0xf0) | (g >> 4); const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) argb[0] = ba; argb[1] = rg; #else @@ -306,6 +307,7 @@ static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) { argb[0] = 0xff; YuvToRgb(y, u, v, argb + 1); } +#endif // WEBP_REDUCE_CSP static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) { YuvToBgr(y, u, v, bgra); @@ -317,6 +319,7 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) { rgba[3] = 0xff; } +#if !defined(WEBP_REDUCE_CSP) static void YuvToRgbLine(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int length) { v16u8 R, G, B; @@ -370,11 +373,12 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u, memcpy(dst, temp, length * 3 * sizeof(*dst)); } } +#endif // WEBP_REDUCE_CSP static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int length) { v16u8 R, G, B; - const v16u8 A = (v16u8)__msa_ldi_b(0xff); + const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); STORE16_4(R, G, B, A, dst); @@ -402,7 +406,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u, static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int length) { v16u8 R, G, B; - const v16u8 A = (v16u8)__msa_ldi_b(0xff); + const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); STORE16_4(B, G, R, A, dst); @@ -427,10 +431,11 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u, } } +#if !defined(WEBP_REDUCE_CSP) static void YuvToArgbLine(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int length) { v16u8 R, G, B; - const v16u8 A = (v16u8)__msa_ldi_b(0xff); + const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL); while (length >= 16) { CALC_RGB16(y, u, v, R, G, B); STORE16_4(A, R, G, B, dst); @@ -459,11 +464,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int length) { v16u8 R, G, B, RG, BA, tmp0, tmp1; while (length >= 16) { - #ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) CALC_RGBA4444(y, u, v, BA, RG, 16, dst); - #else +#else CALC_RGBA4444(y, u, v, RG, BA, 16, dst); - #endif +#endif y += 16; u += 16; v += 16; @@ -473,7 +478,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, if (length > 8) { uint8_t temp[2 * 16] = { 0 }; memcpy(temp, y, length * sizeof(*temp)); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) CALC_RGBA4444(temp, u, v, BA, RG, 16, temp); #else CALC_RGBA4444(temp, u, v, RG, BA, 16, temp); @@ -482,7 +487,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u, } else if (length > 0) { uint8_t temp[2 * 8] = { 0 }; memcpy(temp, y, length * sizeof(*temp)); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) CALC_RGBA4444(temp, u, v, BA, RG, 8, temp); #else CALC_RGBA4444(temp, u, v, RG, BA, 8, temp); @@ -495,11 +500,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int length) { v16u8 R, G, B, RG, GB, tmp0, tmp1; while (length >= 16) { - #ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) CALC_RGB565(y, u, v, GB, RG, 16, dst); - #else +#else CALC_RGB565(y, u, v, RG, GB, 16, dst); - #endif +#endif y += 16; u += 16; v += 16; @@ -509,7 +514,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, if (length > 8) { uint8_t temp[2 * 16] = { 0 }; memcpy(temp, y, length * sizeof(*temp)); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) CALC_RGB565(temp, u, v, GB, RG, 16, temp); #else CALC_RGB565(temp, u, v, RG, GB, 16, temp); @@ -518,7 +523,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, } else if (length > 0) { uint8_t temp[2 * 8] = { 0 }; memcpy(temp, y, length * sizeof(*temp)); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) CALC_RGB565(temp, u, v, GB, RG, 8, temp); #else CALC_RGB565(temp, u, v, RG, GB, 8, temp); @@ -526,6 +531,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u, memcpy(dst, temp, length * 2 * sizeof(*dst)); } } +#endif // WEBP_REDUCE_CSP #define UPSAMPLE_32PIXELS(a, b, c, d) do { \ v16u8 s = __msa_aver_u_b(a, d); \ @@ -640,13 +646,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \ } \ } -UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3) -UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3) UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4) UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4) +#if !defined(WEBP_REDUCE_CSP) +UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3) +UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3) UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4) UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2) UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2) +#endif // WEBP_REDUCE_CSP //------------------------------------------------------------------------------ // Entry point @@ -656,17 +664,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; extern void WebPInitUpsamplersMSA(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) { - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; +#endif // WEBP_REDUCE_CSP } #endif // FANCY_UPSAMPLING diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c index d371a83..17cbc9f 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c @@ -12,15 +12,15 @@ // Author: mans@mansr.com (Mans Rullgard) // Based on SSE code by: somnath@google.com (Somnath Banerjee) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_NEON) #include <assert.h> #include <arm_neon.h> #include <string.h> -#include "./neon.h" -#include "./yuv.h" +#include "src/dsp/neon.h" +#include "src/dsp/yuv.h" #ifdef FANCY_UPSAMPLING @@ -58,8 +58,8 @@ } while (0) // Turn the macro into a function for reducing code-size when non-critical -static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, - uint8_t *out) { +static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2, + uint8_t *out) { UPSAMPLE_16PIXELS(r1, r2, out); } @@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, /* replicate last byte */ \ memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \ memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \ - Upsample16Pixels(r1, r2, out); \ + Upsample16Pixels_NEON(r1, r2, out); \ } //----------------------------------------------------------------------------- @@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ } // NEON variants of the fancy upsampler. -NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3) -NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3) -NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4) -NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4) -NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4) -NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2) -NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2) +NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4) +NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4) +#if !defined(WEBP_REDUCE_CSP) +NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3) +NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3) +NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4) +NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2) +NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2) +#endif // WEBP_REDUCE_CSP //------------------------------------------------------------------------------ // Entry point @@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; extern void WebPInitUpsamplersNEON(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) { - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; - WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; - WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; - WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON; +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON; +#endif // WEBP_REDUCE_CSP } #endif // FANCY_UPSAMPLING diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c index b5b6689..340f1e2 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c @@ -11,14 +11,14 @@ // // Author: somnath@google.com (Somnath Banerjee) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_SSE2) #include <assert.h> #include <emmintrin.h> #include <string.h> -#include "./yuv.h" +#include "src/dsp/yuv.h" #ifdef FANCY_UPSAMPLING @@ -83,13 +83,13 @@ GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ \ /* pack the alternate pixels */ \ - PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \ - PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \ + PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \ + PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \ } // Turn the macro into a function for reducing code-size when non-critical -static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], - uint8_t* const out) { +static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], + uint8_t* const out) { UPSAMPLE_32PIXELS(r1, r2, out); } @@ -101,30 +101,15 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \ memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \ /* using the shared function instead of the macro saves ~3k code size */ \ - Upsample32Pixels(r1, r2, out); \ -} - -#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \ - top_dst, bottom_dst, cur_x, num_pixels) { \ - int n; \ - for (n = 0; n < (num_pixels); ++n) { \ - FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \ - top_dst + ((cur_x) + n) * XSTEP); \ - } \ - if (bottom_y != NULL) { \ - for (n = 0; n < (num_pixels); ++n) { \ - FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \ - bottom_dst + ((cur_x) + n) * XSTEP); \ - } \ - } \ + Upsample32Pixels_SSE2(r1, r2, out); \ } #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \ top_dst, bottom_dst, cur_x) do { \ - FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \ - if (bottom_y != NULL) { \ - FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \ - bottom_dst + (cur_x) * XSTEP); \ + FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \ + if ((bottom_y) != NULL) { \ + FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \ + (bottom_dst) + (cur_x) * (XSTEP)); \ } \ } while (0) @@ -135,7 +120,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ - uint8_t uv_buf[4 * 32 + 15]; \ + uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ uint8_t* const r_v = r_u + 32; \ \ @@ -160,22 +145,36 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ } \ if (len > 1) { \ const int left_over = ((len + 1) >> 1) - (pos >> 1); \ + uint8_t* const tmp_top_dst = r_u + 4 * 32; \ + uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \ + uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \ + uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \ assert(left_over > 0); \ UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \ UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \ - CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \ - pos, len - pos); \ + memcpy(tmp_top, top_y + pos, len - pos); \ + if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \ + CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \ + tmp_bottom_dst, 0); \ + memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \ + if (bottom_y != NULL) { \ + memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \ + (len - pos) * (XSTEP)); \ + } \ } \ } // SSE2 variants of the fancy upsampler. -SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) -SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) -SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) -SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) -SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4) -SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2) -SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2) +SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4) +SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4) + +#if !defined(WEBP_REDUCE_CSP) +SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3) +SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3) +SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4) +SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2) +SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2) +#endif // WEBP_REDUCE_CSP #undef GET_M #undef PACK_AND_STORE @@ -193,17 +192,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; extern void WebPInitUpsamplersSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) { - WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; - WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; - WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; - WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; - WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair; - WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; - WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; - WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2; + WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2; + WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2; +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2; + WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2; + WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2; +#endif // WEBP_REDUCE_CSP } #endif // FANCY_UPSAMPLING @@ -213,29 +214,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) { extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; extern void WebPInitYUV444ConvertersSSE2(void); -#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \ -extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \ - const uint8_t* v, uint8_t* dst, int len); \ +#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ +extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len); \ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ uint8_t* dst, int len) { \ int i; \ const int max_len = len & ~31; \ - for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\ + for (i = 0; i < max_len; i += 32) { \ + CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \ + } \ if (i < len) { /* C-fallback */ \ - WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \ + CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \ } \ } -YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4); -YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4); -YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3); -YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3); +YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4); +YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4); +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3); +YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3); +YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4) +YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \ + WebPYuv444ToRgba4444_C, 2) +YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2) +#endif // WEBP_REDUCE_CSP WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) { - WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba; - WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra; - WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb; - WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr; + WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2; + WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2; + WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2; + WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2; +#if !defined(WEBP_REDUCE_CSP) + WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2; + WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2; + WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2; + WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2; + WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2; + WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2; + WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2; +#endif // WEBP_REDUCE_CSP } #else diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse41.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse41.c new file mode 100644 index 0000000..648d456 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse41.c @@ -0,0 +1,239 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE41 version of YUV to RGB upsampling functions. +// +// Author: somnath@google.com (Somnath Banerjee) + +#include "src/dsp/dsp.h" + +#if defined(WEBP_USE_SSE41) + +#include <assert.h> +#include <smmintrin.h> +#include <string.h> +#include "src/dsp/yuv.h" + +#ifdef FANCY_UPSAMPLING + +#if !defined(WEBP_REDUCE_CSP) + +// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows +// u = (9*a + 3*b + 3*c + d + 8) / 16 +// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2 +// = (a + m + 1) / 2 +// where m = (a + 3*b + 3*c + d) / 8 +// = ((a + b + c + d) / 2 + b + c) / 4 +// +// Let's say k = (a + b + c + d) / 4. +// We can compute k as +// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1 +// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2 +// +// Then m can be written as +// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 + +// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 +#define GET_M(ij, in, out) do { \ + const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \ + const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \ + const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \ + const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\ + const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \ + (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \ +} while (0) + +// pack and store two alternating pixel rows +#define PACK_AND_STORE(a, b, da, db, out) do { \ + const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \ + const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \ + const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \ + const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \ + _mm_store_si128(((__m128i*)(out)) + 0, t_1); \ + _mm_store_si128(((__m128i*)(out)) + 1, t_2); \ +} while (0) + +// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. +#define UPSAMPLE_32PIXELS(r1, r2, out) { \ + const __m128i one = _mm_set1_epi8(1); \ + const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \ + const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \ + const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \ + const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \ + \ + const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \ + const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \ + const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \ + \ + const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \ + const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \ + \ + const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \ + const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \ + const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \ + const __m128i t4 = _mm_avg_epu8(s, t); \ + const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \ + __m128i diag1, diag2; \ + \ + GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \ + GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ + \ + /* pack the alternate pixels */ \ + PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \ + PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \ +} + +// Turn the macro into a function for reducing code-size when non-critical +static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], + uint8_t* const out) { + UPSAMPLE_32PIXELS(r1, r2, out); +} + +#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ + uint8_t r1[17], r2[17]; \ + memcpy(r1, (tb), (num_pixels)); \ + memcpy(r2, (bb), (num_pixels)); \ + /* replicate last byte */ \ + memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \ + memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \ + /* using the shared function instead of the macro saves ~3k code size */ \ + Upsample32Pixels_SSE41(r1, r2, out); \ +} + +#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \ + top_dst, bottom_dst, cur_x) do { \ + FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \ + if ((bottom_y) != NULL) { \ + FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64, \ + (bottom_dst) + (cur_x) * (XSTEP)); \ + } \ +} while (0) + +#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* top_u, const uint8_t* top_v, \ + const uint8_t* cur_u, const uint8_t* cur_v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int uv_pos, pos; \ + /* 16byte-aligned array to cache reconstructed u and v */ \ + uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_v = r_u + 32; \ + \ + assert(top_y != NULL); \ + { /* Treat the first pixel in regular way */ \ + const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ + const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ + const int u0_t = (top_u[0] + u_diag) >> 1; \ + const int v0_t = (top_v[0] + v_diag) >> 1; \ + FUNC(top_y[0], u0_t, v0_t, top_dst); \ + if (bottom_y != NULL) { \ + const int u0_b = (cur_u[0] + u_diag) >> 1; \ + const int v0_b = (cur_v[0] + v_diag) >> 1; \ + FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \ + } \ + } \ + /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \ + for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \ + UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \ + UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \ + CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \ + } \ + if (len > 1) { \ + const int left_over = ((len + 1) >> 1) - (pos >> 1); \ + uint8_t* const tmp_top_dst = r_u + 4 * 32; \ + uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \ + uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \ + uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \ + assert(left_over > 0); \ + UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \ + UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \ + memcpy(tmp_top, top_y + pos, len - pos); \ + if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \ + CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \ + tmp_bottom_dst, 0); \ + memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \ + if (bottom_y != NULL) { \ + memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \ + (len - pos) * (XSTEP)); \ + } \ + } \ +} + +// SSE4 variants of the fancy upsampler. +SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41, VP8YuvToRgb, 3) +SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41, VP8YuvToBgr, 3) + +#undef GET_M +#undef PACK_AND_STORE +#undef UPSAMPLE_32PIXELS +#undef UPSAMPLE_LAST_BLOCK +#undef CONVERT2RGB +#undef CONVERT2RGB_32 +#undef SSE4_UPSAMPLE_FUNC + +#endif // WEBP_REDUCE_CSP + +//------------------------------------------------------------------------------ +// Entry point + +extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; + +extern void WebPInitUpsamplersSSE41(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) { +#if !defined(WEBP_REDUCE_CSP) + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE41; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE41; +#endif // WEBP_REDUCE_CSP +} + +#endif // FANCY_UPSAMPLING + +//------------------------------------------------------------------------------ + +extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; +extern void WebPInitYUV444ConvertersSSE41(void); + +#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \ +extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len); \ +static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len) { \ + int i; \ + const int max_len = len & ~31; \ + for (i = 0; i < max_len; i += 32) { \ + CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \ + } \ + if (i < len) { /* C-fallback */ \ + CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \ + } \ +} + +#if !defined(WEBP_REDUCE_CSP) +YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3); +YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3); +#endif // WEBP_REDUCE_CSP + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) { +#if !defined(WEBP_REDUCE_CSP) + WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE41; + WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE41; +#endif // WEBP_REDUCE_CSP +} + +#else + +WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41) + +#endif // WEBP_USE_SSE41 + +#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41)) +WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41) +#endif diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c index dd7d9de..14e67fc 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv.c +++ b/src/3rdparty/libwebp/src/dsp/yuv.c @@ -11,63 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./yuv.h" +#include "src/dsp/yuv.h" +#include <assert.h> #include <stdlib.h> -#if defined(WEBP_YUV_USE_TABLE) - -static int done = 0; - -static WEBP_INLINE uint8_t clip(int v, int max_value) { - return v < 0 ? 0 : v > max_value ? max_value : v; -} - -int16_t VP8kVToR[256], VP8kUToB[256]; -int32_t VP8kVToG[256], VP8kUToG[256]; -uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; -uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN]; - -WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) { - int i; - if (done) { - return; - } -#ifndef USE_YUVj - for (i = 0; i < 256; ++i) { - VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX; - VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF; - VP8kVToG[i] = -45773 * (i - 128); - VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX; - } - for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { - const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX; - VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); - VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); - } -#else - for (i = 0; i < 256; ++i) { - VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX; - VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF; - VP8kVToG[i] = -46802 * (i - 128); - VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX; - } - for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { - const int k = i; - VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); - VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); - } -#endif - - done = 1; -} - -#else - -WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {} - -#endif // WEBP_YUV_USE_TABLE - //----------------------------------------------------------------------------- // Plain-C version @@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {} static void FUNC_NAME(const uint8_t* y, \ const uint8_t* u, const uint8_t* v, \ uint8_t* dst, int len) { \ - const uint8_t* const end = dst + (len & ~1) * XSTEP; \ + const uint8_t* const end = dst + (len & ~1) * (XSTEP); \ while (dst != end) { \ FUNC(y[0], u[0], v[0], dst); \ - FUNC(y[1], u[0], v[0], dst + XSTEP); \ + FUNC(y[1], u[0], v[0], dst + (XSTEP)); \ y += 2; \ ++u; \ ++v; \ - dst += 2 * XSTEP; \ + dst += 2 * (XSTEP); \ } \ if (len & 1) { \ FUNC(y[0], u[0], v[0], dst); \ @@ -123,15 +71,11 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, WebPSamplerRowFunc WebPSamplers[MODE_LAST]; extern void WebPInitSamplersSSE2(void); +extern void WebPInitSamplersSSE41(void); extern void WebPInitSamplersMIPS32(void); extern void WebPInitSamplersMIPSdspR2(void); -static volatile VP8CPUInfo yuv_last_cpuinfo_used = - (VP8CPUInfo)&yuv_last_cpuinfo_used; - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) { - if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return; - +WEBP_DSP_INIT_FUNC(WebPInitSamplers) { WebPSamplers[MODE_RGB] = YuvToRgbRow; WebPSamplers[MODE_RGBA] = YuvToRgbaRow; WebPSamplers[MODE_BGR] = YuvToBgrRow; @@ -151,6 +95,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) { WebPInitSamplersSSE2(); } #endif // WEBP_USE_SSE2 +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitSamplersSSE41(); + } +#endif // WEBP_USE_SSE41 #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { WebPInitSamplersMIPS32(); @@ -162,13 +111,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) { } #endif // WEBP_USE_MIPS_DSP_R2 } - yuv_last_cpuinfo_used = VP8GetCPUInfo; } //----------------------------------------------------------------------------- // ARGB -> YUV converters -static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) { int i; for (i = 0; i < width; ++i) { const uint32_t p = argb[i]; @@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, //----------------------------------------------------------------------------- -static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) { int i; for (i = 0; i < width; ++i, rgb += 3) { y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); } } -static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) { int i; for (i = 0; i < width; ++i, bgr += 3) { y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); @@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, //----------------------------------------------------------------------------- +#if !WEBP_NEON_OMIT_C_CODE #define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic static uint16_t clip_y(int v) { return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; @@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len, out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } } +#endif // !WEBP_NEON_OMIT_C_CODE #undef MAX_Y @@ -304,26 +254,26 @@ void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src, void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out); -static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used = - (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used; - extern void WebPInitConvertARGBToYUVSSE2(void); +extern void WebPInitConvertARGBToYUVSSE41(void); +extern void WebPInitConvertARGBToYUVNEON(void); extern void WebPInitSharpYUVSSE2(void); +extern void WebPInitSharpYUVNEON(void); -WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) { - if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return; - - WebPConvertARGBToY = ConvertARGBToY; +WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { + WebPConvertARGBToY = ConvertARGBToY_C; WebPConvertARGBToUV = WebPConvertARGBToUV_C; - WebPConvertRGB24ToY = ConvertRGB24ToY; - WebPConvertBGR24ToY = ConvertBGR24ToY; + WebPConvertRGB24ToY = ConvertRGB24ToY_C; + WebPConvertBGR24ToY = ConvertBGR24ToY_C; WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; +#if !WEBP_NEON_OMIT_C_CODE WebPSharpYUVUpdateY = SharpYUVUpdateY_C; WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C; WebPSharpYUVFilterRow = SharpYUVFilterRow_C; +#endif if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -332,6 +282,27 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) { WebPInitSharpYUVSSE2(); } #endif // WEBP_USE_SSE2 +#if defined(WEBP_USE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + WebPInitConvertARGBToYUVSSE41(); + } +#endif // WEBP_USE_SSE41 + } + +#if defined(WEBP_USE_NEON) + if (WEBP_NEON_OMIT_C_CODE || + (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { + WebPInitConvertARGBToYUVNEON(); + WebPInitSharpYUVNEON(); } - rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo; +#endif // WEBP_USE_NEON + + assert(WebPConvertARGBToY != NULL); + assert(WebPConvertARGBToUV != NULL); + assert(WebPConvertRGB24ToY != NULL); + assert(WebPConvertBGR24ToY != NULL); + assert(WebPConvertRGBA32ToUV != NULL); + assert(WebPSharpYUVUpdateY != NULL); + assert(WebPSharpYUVUpdateRGB != NULL); + assert(WebPSharpYUVFilterRow != NULL); } diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h index 1d33b58..c12be1d 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv.h +++ b/src/3rdparty/libwebp/src/dsp/yuv.h @@ -35,18 +35,8 @@ #ifndef WEBP_DSP_YUV_H_ #define WEBP_DSP_YUV_H_ -#include "./dsp.h" -#include "../dec/vp8_dec.h" - -#if defined(WEBP_EXPERIMENTAL_FEATURES) -// Do NOT activate this feature for real compression. This is only experimental! -// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace. -// This colorspace is close to Rec.601's Y'CbCr model with the notable -// difference of allowing larger range for luma/chroma. -// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its -// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion -// #define USE_YUVj -#endif +#include "src/dsp/dsp.h" +#include "src/dec/vp8_dec.h" //------------------------------------------------------------------------------ // YUV -> RGB conversion @@ -58,12 +48,8 @@ extern "C" { enum { YUV_FIX = 16, // fixed-point precision for RGB->YUV YUV_HALF = 1 << (YUV_FIX - 1), - YUV_MASK = (256 << YUV_FIX) - 1, - YUV_RANGE_MIN = -227, // min value of r/g/b output - YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output YUV_FIX2 = 6, // fixed-point precision for YUV->RGB - YUV_HALF2 = 1 << YUV_FIX2 >> 1, YUV_MASK2 = (256 << YUV_FIX2) - 1 }; @@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v, const int b = VP8YUVToB(y, u); // 5 usable bits const int rg = (r & 0xf8) | (g >> 5); const int gb = ((g << 3) & 0xe0) | (b >> 3); -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) rgb[0] = gb; rgb[1] = rg; #else @@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v, const int b = VP8YUVToB(y, u); // 4 usable bits const int rg = (r & 0xf0) | (g >> 4); const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits -#ifdef WEBP_SWAP_16BIT_CSP +#if (WEBP_SWAP_16BIT_CSP == 1) argb[0] = ba; argb[1] = rg; #else @@ -157,32 +143,42 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, rgba[3] = 0xff; } -// Must be called before everything, to initialize the tables. -void VP8YUVInit(void); - //----------------------------------------------------------------------------- // SSE2 extra functions (mostly for upsampling_sse2.c) #if defined(WEBP_USE_SSE2) // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. -void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); -void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, +void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst); -void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst); +void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, + const uint8_t* v, uint8_t* dst); +void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); #endif // WEBP_USE_SSE2 +//----------------------------------------------------------------------------- +// SSE41 extra functions (mostly for upsampling_sse41.c) + +#if defined(WEBP_USE_SSE41) + +// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst. +void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); +void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst); + +#endif // WEBP_USE_SSE41 + //------------------------------------------------------------------------------ // RGB -> YUV conversion @@ -192,8 +188,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) { return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255; } -#ifndef USE_YUVj - static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) { const int luma = 16839 * r + 33059 * g + 6420 * b; return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip @@ -209,30 +203,8 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { return VP8ClipUV(v, rounding); } -#else - -// This JPEG-YUV colorspace, only for comparison! -// These are also 16bit precision coefficients from Rec.601, but with full -// [0..255] output range. -static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) { - const int luma = 19595 * r + 38470 * g + 7471 * b; - return (luma + rounding) >> YUV_FIX; // no need to clip -} - -static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) { - const int u = -11058 * r - 21710 * g + 32768 * b; - return VP8ClipUV(u, rounding); -} - -static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { - const int v = 32768 * r - 27439 * g - 5329 * b; - return VP8ClipUV(v, rounding); -} - -#endif // USE_YUVj - #ifdef __cplusplus } // extern "C" #endif -#endif /* WEBP_DSP_YUV_H_ */ +#endif // WEBP_DSP_YUV_H_ diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c index e61aac5..9d0a887 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c @@ -12,11 +12,11 @@ // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) // Jovan Zelincevic (jovan.zelincevic@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS32) -#include "./yuv.h" +#include "src/dsp/yuv.h" //------------------------------------------------------------------------------ // simple point-sampling @@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y, \ } \ } -ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0) -ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3) -ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0) -ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3) +ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0) +ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3) +ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0) +ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3) #undef ROW_FUNC @@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3) extern void WebPInitSamplersMIPS32(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) { - WebPSamplers[MODE_RGB] = YuvToRgbRow; - WebPSamplers[MODE_RGBA] = YuvToRgbaRow; - WebPSamplers[MODE_BGR] = YuvToBgrRow; - WebPSamplers[MODE_BGRA] = YuvToBgraRow; + WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32; + WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32; + WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32; + WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c index 1720d41..cc8afcc 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c @@ -12,11 +12,11 @@ // Author(s): Branimir Vasic (branimir.vasic@imgtec.com) // Djordje Pesut (djordje.pesut@imgtec.com) -#include "./dsp.h" +#include "src/dsp/dsp.h" #if defined(WEBP_USE_MIPS_DSP_R2) -#include "./yuv.h" +#include "src/dsp/yuv.h" //------------------------------------------------------------------------------ // simple point-sampling @@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y, \ } \ } -ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0) -ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3) -ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0) -ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3) +ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0) +ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3) +ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0) +ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3) #undef ROW_FUNC #undef ASM_CLOBBER_LIST @@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3) extern void WebPInitSamplersMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) { - WebPSamplers[MODE_RGB] = YuvToRgbRow; - WebPSamplers[MODE_RGBA] = YuvToRgbaRow; - WebPSamplers[MODE_BGR] = YuvToBgrRow; - WebPSamplers[MODE_BGRA] = YuvToBgraRow; + WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2; + WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2; + WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2; + WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/yuv_neon.c b/src/3rdparty/libwebp/src/dsp/yuv_neon.c new file mode 100644 index 0000000..a34d602 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/yuv_neon.c @@ -0,0 +1,288 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// YUV->RGB conversion functions +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "src/dsp/yuv.h" + +#if defined(WEBP_USE_NEON) + +#include <assert.h> +#include <stdlib.h> + +#include "src/dsp/neon.h" + +//----------------------------------------------------------------------------- + +static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, + const uint8x8_t G, + const uint8x8_t B) { + const uint16x8_t r = vmovl_u8(R); + const uint16x8_t g = vmovl_u8(G); + const uint16x8_t b = vmovl_u8(B); + const uint16x4_t r_lo = vget_low_u16(r); + const uint16x4_t r_hi = vget_high_u16(r); + const uint16x4_t g_lo = vget_low_u16(g); + const uint16x4_t g_hi = vget_high_u16(g); + const uint16x4_t b_lo = vget_low_u16(b); + const uint16x4_t b_hi = vget_high_u16(b); + const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u); + const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u); + const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u); + const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u); + const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u); + const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u); + const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16), + vrshrn_n_u32(tmp2_hi, 16)); + const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16)); + return vqmovn_u16(Y2); +} + +static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) { + const uint8x8x3_t RGB = vld3_u8(rgb); + const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]); + vst1_u8(y + i, Y); + } + for (; i < width; ++i, rgb += 3) { // left-over + y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); + } +} + +static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) { + const uint8x8x3_t BGR = vld3_u8(bgr); + const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]); + vst1_u8(y + i, Y); + } + for (; i < width; ++i, bgr += 3) { // left-over + y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); + } +} + +static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8) { + const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]); + const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]); + vst1_u8(y + i, Y); + } + for (; i < width; ++i) { // left-over + const uint32_t p = argb[i]; + y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, + YUV_HALF); + } +} + +//----------------------------------------------------------------------------- + +// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST +#define MULTIPLY_16b_PREAMBLE(r, g, b) \ + const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \ + const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \ + const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \ + const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \ + const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \ + const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b)) + +#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \ + const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \ + const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \ + const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \ + const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \ + const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \ + const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \ + const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \ + vshrn_n_s32(tmp2_hi, 16)); \ + DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \ +} while (0) + +// This needs to be a macro, since (128 << SHIFT) needs to be an immediate. +#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \ + MULTIPLY_16b_PREAMBLE(r, g, b); \ + MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \ + MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \ +} while (0) + +static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb, + uint8_t* u, uint8_t* v, int width) { + int i; + for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) { + const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb); + int16x8_t U, V; + CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V); + vst1_u8(u + i, vqrshrun_n_s16(U, 2)); + vst1_u8(v + i, vqrshrun_n_s16(V, 2)); + } + for (; i < width; i += 1, rgb += 4) { + const int r = rgb[0], g = rgb[1], b = rgb[2]; + u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2); + v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2); + } +} + +static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v, + int src_width, int do_store) { + int i; + for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) { + const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]); + const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds + const uint16x8_t G = vpaddlq_u8(RGB.val[1]); + const uint16x8_t B = vpaddlq_u8(RGB.val[0]); + int16x8_t U_tmp, V_tmp; + CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp); + { + const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1); + const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1); + if (do_store) { + vst1_u8(u, U); + vst1_u8(v, V); + } else { + const uint8x8_t prev_u = vld1_u8(u); + const uint8x8_t prev_v = vld1_u8(v); + vst1_u8(u, vrhadd_u8(U, prev_u)); + vst1_u8(v, vrhadd_u8(V, prev_v)); + } + } + } + if (i < src_width) { // left-over + WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); + } +} + + +//------------------------------------------------------------------------------ + +extern void WebPInitConvertARGBToYUVNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) { + WebPConvertRGB24ToY = ConvertRGB24ToY_NEON; + WebPConvertBGR24ToY = ConvertBGR24ToY_NEON; + WebPConvertARGBToY = ConvertARGBToY_NEON; + WebPConvertARGBToUV = ConvertARGBToUV_NEON; + WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON; +} + +//------------------------------------------------------------------------------ + +#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic +static uint16_t clip_y_NEON(int v) { + return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; +} + +static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, + uint16_t* dst, int len) { + int i; + const int16x8_t zero = vdupq_n_s16(0); + const int16x8_t max = vdupq_n_s16(MAX_Y); + uint64x2_t sum = vdupq_n_u64(0); + uint64_t diff; + + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); + const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); + const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); + const int16x8_t D = vsubq_s16(A, B); // diff_y + const int16x8_t F = vaddq_s16(C, D); // new_y + const uint16x8_t H = + vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); + const int16x8_t I = vabsq_s16(D); // abs(diff_y) + vst1q_u16(dst + i, H); + sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); + } + diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); + for (; i < len; ++i) { + const int diff_y = ref[i] - src[i]; + const int new_y = (int)(dst[i]) + diff_y; + dst[i] = clip_y_NEON(new_y); + diff += (uint64_t)(abs(diff_y)); + } + return diff; +} + +static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src, + int16_t* dst, int len) { + int i; + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t A = vld1q_s16(ref + i); + const int16x8_t B = vld1q_s16(src + i); + const int16x8_t C = vld1q_s16(dst + i); + const int16x8_t D = vsubq_s16(A, B); // diff_uv + const int16x8_t E = vaddq_s16(C, D); // new_uv + vst1q_s16(dst + i, E); + } + for (; i < len; ++i) { + const int diff_uv = ref[i] - src[i]; + dst[i] += diff_uv; + } +} + +static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, + const uint16_t* best_y, uint16_t* out) { + int i; + const int16x8_t max = vdupq_n_s16(MAX_Y); + const int16x8_t zero = vdupq_n_s16(0); + for (i = 0; i + 8 <= len; i += 8) { + const int16x8_t a0 = vld1q_s16(A + i + 0); + const int16x8_t a1 = vld1q_s16(A + i + 1); + const int16x8_t b0 = vld1q_s16(B + i + 0); + const int16x8_t b1 = vld1q_s16(B + i + 1); + const int16x8_t a0b1 = vaddq_s16(a0, b1); + const int16x8_t a1b0 = vaddq_s16(a1, b0); + const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 + const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) + const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) + const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); + const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); + const int16x8_t d0 = vaddq_s16(c1, a0); + const int16x8_t d1 = vaddq_s16(c0, a1); + const int16x8_t e0 = vrshrq_n_s16(d0, 1); + const int16x8_t e1 = vrshrq_n_s16(d1, 1); + const int16x8x2_t f = vzipq_s16(e0, e1); + const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); + const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); + const int16x8_t h0 = vaddq_s16(g0, f.val[0]); + const int16x8_t h1 = vaddq_s16(g1, f.val[1]); + const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); + const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); + vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); + vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); + } + for (; i < len; ++i) { + const int a0b1 = A[i + 0] + B[i + 1]; + const int a1b0 = A[i + 1] + B[i + 0]; + const int a0a1b0b1 = a0b1 + a1b0 + 8; + const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; + const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; + out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0); + out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1); + } +} +#undef MAX_Y + +//------------------------------------------------------------------------------ + +extern void WebPInitSharpYUVNEON(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) { + WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON; + WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON; + WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON; +} + +#else // !WEBP_USE_NEON + +WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON) +WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON) + +#endif // WEBP_USE_NEON diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c index e33c2bb..baa48d5 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c @@ -11,11 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./yuv.h" +#include "src/dsp/yuv.h" #if defined(WEBP_USE_SSE2) -#include "./common_sse2.h" +#include "src/dsp/common_sse2.h" #include <stdlib.h> #include <emmintrin.h> @@ -26,12 +26,12 @@ // R = (19077 * y + 26149 * v - 14234) >> 6 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 // B = (19077 * y + 33050 * u - 17685) >> 6 -static void ConvertYUV444ToRGB(const __m128i* const Y0, - const __m128i* const U0, - const __m128i* const V0, - __m128i* const R, - __m128i* const G, - __m128i* const B) { +static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0, + const __m128i* const U0, + const __m128i* const V0, + __m128i* const R, + __m128i* const G, + __m128i* const B) { const __m128i k19077 = _mm_set1_epi16(19077); const __m128i k26149 = _mm_set1_epi16(26149); const __m128i k14234 = _mm_set1_epi16(14234); @@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0, } // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. -static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) { +static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); } // Load and replicate the U/V samples -static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { +static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); @@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { } // Convert 32 samples of YUV444 to R/G/B -static void YUV444ToRGB(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, - __m128i* const R, __m128i* const G, __m128i* const B) { - const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v); - ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); +static void YUV444ToRGB_SSE2(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, + __m128i* const B) { + const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u), + V0 = Load_HI_16_SSE2(v); + ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B); } // Convert 32 samples of YUV420 to R/G/B -static void YUV420ToRGB(const uint8_t* const y, - const uint8_t* const u, - const uint8_t* const v, - __m128i* const R, __m128i* const G, __m128i* const B) { - const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v); - ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); +static void YUV420ToRGB_SSE2(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, + __m128i* const B) { + const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u), + V0 = Load_UV_HI_8_SSE2(v); + ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B); } // Pack R/G/B/A results into 32b output. -static WEBP_INLINE void PackAndStore4(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - const __m128i* const A, - uint8_t* const dst) { +static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + const __m128i* const A, + uint8_t* const dst) { const __m128i rb = _mm_packus_epi16(*R, *B); const __m128i ga = _mm_packus_epi16(*G, *A); const __m128i rg = _mm_unpacklo_epi8(rb, ga); @@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R, } // Pack R/G/B/A results into 16b output. -static WEBP_INLINE void PackAndStore4444(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - const __m128i* const A, - uint8_t* const dst) { -#if !defined(WEBP_SWAP_16BIT_CSP) +static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + const __m128i* const A, + uint8_t* const dst) { +#if (WEBP_SWAP_16BIT_CSP == 0) const __m128i rg0 = _mm_packus_epi16(*R, *G); const __m128i ba0 = _mm_packus_epi16(*B, *A); #else @@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R, } // Pack R/G/B results into 16b output. -static WEBP_INLINE void PackAndStore565(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - uint8_t* const dst) { +static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + uint8_t* const dst) { const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); @@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R, const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); const __m128i rg = _mm_or_si128(r1, g1); const __m128i gb = _mm_or_si128(g2, b1); -#if !defined(WEBP_SWAP_16BIT_CSP) +#if (WEBP_SWAP_16BIT_CSP == 0) const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); #else const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); @@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R, // Pack the planar buffers // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... -static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, - __m128i* const in2, __m128i* const in3, - __m128i* const in4, __m128i* const in5, - uint8_t* const rgb) { +static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, + __m128i* const in2, __m128i* const in3, + __m128i* const in4, __m128i* const in5, + uint8_t* const rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. // To pack, we will keep taking one every two 8b integer and move it @@ -176,7 +180,7 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, // Repeat the same permutations twice more: // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 - VP8PlanarTo24b(in0, in1, in2, in3, in4, in5); + VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5); _mm_storeu_si128((__m128i*)(rgb + 0), *in0); _mm_storeu_si128((__m128i*)(rgb + 16), *in1); @@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1, _mm_storeu_si128((__m128i*)(rgb + 80), *in5); } -void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4(&R, &G, &B, &kAlpha, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst); } } -void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4(&B, &G, &R, &kAlpha, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst); } } -void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 32) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4(&kAlpha, &R, &G, &B, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst); } } -void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u, + const uint8_t* v, uint8_t* dst) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n < 32; n += 8, dst += 16) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore4444(&R, &G, &B, &kAlpha, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst); } } -void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { int n; for (n = 0; n < 32; n += 8, dst += 16) { __m128i R, G, B; - YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); - PackAndStore565(&R, &G, &B, dst); + YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); + PackAndStore565_SSE2(&R, &G, &B, dst); } } -void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; - YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); - YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); - YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); + YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3); // Cast to 8b and store as RRRRGGGGBBBB. rgb0 = _mm_packus_epi16(R0, R1); @@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, rgb5 = _mm_packus_epi16(B2, B3); // Pack as RGBRGBRGBRGB. - PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); + PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); } -void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst) { +void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; - YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); - YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); - YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); + YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3); // Cast to 8b and store as BBBBGGGGRRRR. bgr0 = _mm_packus_epi16(B0, B1); @@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, bgr5= _mm_packus_epi16(R2, R3); // Pack as BGRBGRBGRBGR. - PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); + PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); } //----------------------------------------------------------------------------- // Arbitrary-length row conversion functions -static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbaRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { __m128i R, G, B; - YUV420ToRGB(y, u, v, &R, &G, &B); - PackAndStore4(&R, &G, &B, &kAlpha, dst); + YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); + PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst); y += 8; u += 4; v += 4; @@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgraRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { __m128i R, G, B; - YUV420ToRGB(y, u, v, &R, &G, &B); - PackAndStore4(&B, &G, &R, &kAlpha, dst); + YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); + PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst); y += 8; u += 4; v += 4; @@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToArgbRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { const __m128i kAlpha = _mm_set1_epi16(255); int n; for (n = 0; n + 8 <= len; n += 8, dst += 32) { __m128i R, G, B; - YUV420ToRGB(y, u, v, &R, &G, &B); - PackAndStore4(&kAlpha, &R, &G, &B, dst); + YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); + PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst); y += 8; u += 4; v += 4; @@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToRgbRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; - YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); - YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); - YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); + YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3); // Cast to 8b and store as RRRRGGGGBBBB. rgb0 = _mm_packus_epi16(R0, R1); @@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, rgb5 = _mm_packus_epi16(B2, B3); // Pack as RGBRGBRGBRGB. - PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); + PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); y += 32; u += 16; @@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, } } -static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, - uint8_t* dst, int len) { +static void YuvToBgrRow_SSE2(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { int n; for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; - YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); - YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); - YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); - YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); + YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3); // Cast to 8b and store as BBBBGGGGRRRR. bgr0 = _mm_packus_epi16(B0, B1); @@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, bgr5 = _mm_packus_epi16(R2, R3); // Pack as BGRBGRBGRBGR. - PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); + PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); y += 32; u += 16; @@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, extern void WebPInitSamplersSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { - WebPSamplers[MODE_RGB] = YuvToRgbRow; - WebPSamplers[MODE_RGBA] = YuvToRgbaRow; - WebPSamplers[MODE_BGR] = YuvToBgrRow; - WebPSamplers[MODE_BGRA] = YuvToBgraRow; - WebPSamplers[MODE_ARGB] = YuvToArgbRow; + WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2; + WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2; + WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2; + WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2; + WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2; } //------------------------------------------------------------------------------ @@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { // Function that inserts a value of the second half of the in buffer in between // every two char of the first half. -static WEBP_INLINE void RGB24PackedToPlanarHelper( +static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2( const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) { out[0] = _mm_unpacklo_epi8(in[0], in[3]); out[1] = _mm_unpackhi_epi8(in[0], in[3]); @@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper( // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // Similar to PlanarTo24bHelper(), but in reverse order. -static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, - __m128i* const out /*out[6]*/) { +static WEBP_INLINE void RGB24PackedToPlanar_SSE2( + const uint8_t* const rgb, __m128i* const out /*out[6]*/) { __m128i tmp[6]; tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); @@ -468,22 +477,22 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64)); tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80)); - RGB24PackedToPlanarHelper(tmp, out); - RGB24PackedToPlanarHelper(out, tmp); - RGB24PackedToPlanarHelper(tmp, out); - RGB24PackedToPlanarHelper(out, tmp); - RGB24PackedToPlanarHelper(tmp, out); + RGB24PackedToPlanarHelper_SSE2(tmp, out); + RGB24PackedToPlanarHelper_SSE2(out, tmp); + RGB24PackedToPlanarHelper_SSE2(tmp, out); + RGB24PackedToPlanarHelper_SSE2(out, tmp); + RGB24PackedToPlanarHelper_SSE2(tmp, out); } // Convert 8 packed ARGB to r[], g[], b[] -static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, - __m128i* const rgb /*in[6]*/) { +static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb, + __m128i* const rgb /*in[6]*/) { const __m128i zero = _mm_setzero_si128(); __m128i a0 = LOAD_16(argb + 0); __m128i a1 = LOAD_16(argb + 4); __m128i a2 = LOAD_16(argb + 8); __m128i a3 = LOAD_16(argb + 12); - VP8L32bToPlanar(&a0, &a1, &a2, &a3); + VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3); rgb[0] = _mm_unpacklo_epi8(a1, zero); rgb[1] = _mm_unpackhi_epi8(a1, zero); rgb[2] = _mm_unpacklo_epi8(a2, zero); @@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, } while (0) #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) -static WEBP_INLINE void ConvertRGBToY(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - __m128i* const Y) { +static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const Y) { const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); const __m128i kGB_y = MK_CST_16(16384, 6420); const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); @@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R, TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y); } -static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, - const __m128i* const G, - const __m128i* const B, - __m128i* const U, __m128i* const V) { +static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const U, + __m128i* const V) { const __m128i kRG_u = MK_CST_16(-9719, -19081); const __m128i kGB_u = MK_CST_16(0, 28800); const __m128i kRG_v = MK_CST_16(28800, 0); @@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, #undef MK_CST_16 #undef TRANSFORM -static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { +static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; rgb += 3 * 16 * 2) { __m128i rgb_plane[6]; int j; - RGB24PackedToPlanar(rgb, rgb_plane); + RGB24PackedToPlanar_SSE2(rgb, rgb_plane); for (j = 0; j < 2; ++j, i += 16) { const __m128i zero = _mm_setzero_si128(); @@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); - ConvertRGBToY(&r, &g, &b, &Y0); + ConvertRGBToY_SSE2(&r, &g, &b, &Y0); // Convert to 16-bit Y. r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); - ConvertRGBToY(&r, &g, &b, &Y1); + ConvertRGBToY_SSE2(&r, &g, &b, &Y1); // Cast to 8-bit and store. STORE_16(_mm_packus_epi16(Y0, Y1), y + i); @@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { } } -static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { +static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) { const int max_width = width & ~31; int i; for (i = 0; i < max_width; bgr += 3 * 16 * 2) { __m128i bgr_plane[6]; int j; - RGB24PackedToPlanar(bgr, bgr_plane); + RGB24PackedToPlanar_SSE2(bgr, bgr_plane); for (j = 0; j < 2; ++j, i += 16) { const __m128i zero = _mm_setzero_si128(); @@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); - ConvertRGBToY(&r, &g, &b, &Y0); + ConvertRGBToY_SSE2(&r, &g, &b, &Y0); // Convert to 16-bit Y. b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); - ConvertRGBToY(&r, &g, &b, &Y1); + ConvertRGBToY_SSE2(&r, &g, &b, &Y1); // Cast to 8-bit and store. STORE_16(_mm_packus_epi16(Y0, Y1), y + i); @@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { } } -static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { +static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) { const int max_width = width & ~15; int i; for (i = 0; i < max_width; i += 16) { __m128i Y0, Y1, rgb[6]; - RGB32PackedToPlanar(&argb[i], rgb); - ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0); - ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1); + RGB32PackedToPlanar_SSE2(&argb[i], rgb); + ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0); + ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1); STORE_16(_mm_packus_epi16(Y0, Y1), y + i); } for (; i < width; ++i) { // left-over @@ -636,31 +646,33 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { // Horizontal add (doubled) of two 16b values, result is 16b. // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... -static void HorizontalAddPack(const __m128i* const A, const __m128i* const B, - __m128i* const out) { +static void HorizontalAddPack_SSE2(const __m128i* const A, + const __m128i* const B, + __m128i* const out) { const __m128i k2 = _mm_set1_epi16(2); const __m128i C = _mm_madd_epi16(*A, k2); const __m128i D = _mm_madd_epi16(*B, k2); *out = _mm_packs_epi32(C, D); } -static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, - int src_width, int do_store) { +static void ConvertARGBToUV_SSE2(const uint32_t* argb, + uint8_t* u, uint8_t* v, + int src_width, int do_store) { const int max_width = src_width & ~31; int i; for (i = 0; i < max_width; i += 32, u += 16, v += 16) { __m128i rgb[6], U0, V0, U1, V1; - RGB32PackedToPlanar(&argb[i], rgb); - HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); - HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); - HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); - ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); - - RGB32PackedToPlanar(&argb[i + 16], rgb); - HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]); - HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]); - HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]); - ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); + RGB32PackedToPlanar_SSE2(&argb[i], rgb); + HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); + HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); + HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); + ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); + + RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb); + HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); + HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); + HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); + ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); U0 = _mm_packus_epi16(U0, U1); V0 = _mm_packus_epi16(V0, V1); @@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, } // Convert 16 packed ARGB 16b-values to r[], g[], b[] -static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx, - __m128i* const r, - __m128i* const g, - __m128i* const b) { +static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2( + const uint16_t* const rgbx, + __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ... @@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx, *b = _mm_unpacklo_epi64(B1, B3); } -static void ConvertRGBA32ToUV(const uint16_t* rgb, - uint8_t* u, uint8_t* v, int width) { +static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb, + uint8_t* u, uint8_t* v, int width) { const int max_width = width & ~15; const uint16_t* const last_rgb = rgb + 4 * max_width; while (rgb < last_rgb) { __m128i r, g, b, U0, V0, U1, V1; - RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b); - ConvertRGBToUV(&r, &g, &b, &U0, &V0); - RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b); - ConvertRGBToUV(&r, &g, &b, &U1, &V1); + RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b); + ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0); + RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b); + ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1); STORE_16(_mm_packus_epi16(U0, U1), u); STORE_16(_mm_packus_epi16(V0, V1), v); u += 16; @@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb, extern void WebPInitConvertARGBToYUVSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { - WebPConvertARGBToY = ConvertARGBToY; - WebPConvertARGBToUV = ConvertARGBToUV; + WebPConvertARGBToY = ConvertARGBToY_SSE2; + WebPConvertARGBToUV = ConvertARGBToUV_SSE2; - WebPConvertRGB24ToY = ConvertRGB24ToY; - WebPConvertBGR24ToY = ConvertBGR24ToY; + WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2; + WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2; - WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; + WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2; } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse41.c b/src/3rdparty/libwebp/src/dsp/yuv_sse41.c new file mode 100644 index 0000000..579d1f7 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/yuv_sse41.c @@ -0,0 +1,613 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// YUV->RGB conversion functions +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "src/dsp/yuv.h" + +#if defined(WEBP_USE_SSE41) + +#include "src/dsp/common_sse41.h" +#include <stdlib.h> +#include <smmintrin.h> + +//----------------------------------------------------------------------------- +// Convert spans of 32 pixels to various RGB formats for the fancy upsampler. + +// These constants are 14b fixed-point version of ITU-R BT.601 constants. +// R = (19077 * y + 26149 * v - 14234) >> 6 +// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 +// B = (19077 * y + 33050 * u - 17685) >> 6 +static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0, + const __m128i* const U0, + const __m128i* const V0, + __m128i* const R, + __m128i* const G, + __m128i* const B) { + const __m128i k19077 = _mm_set1_epi16(19077); + const __m128i k26149 = _mm_set1_epi16(26149); + const __m128i k14234 = _mm_set1_epi16(14234); + // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic + const __m128i k33050 = _mm_set1_epi16((short)33050); + const __m128i k17685 = _mm_set1_epi16(17685); + const __m128i k6419 = _mm_set1_epi16(6419); + const __m128i k13320 = _mm_set1_epi16(13320); + const __m128i k8708 = _mm_set1_epi16(8708); + + const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077); + + const __m128i R0 = _mm_mulhi_epu16(*V0, k26149); + const __m128i R1 = _mm_sub_epi16(Y1, k14234); + const __m128i R2 = _mm_add_epi16(R1, R0); + + const __m128i G0 = _mm_mulhi_epu16(*U0, k6419); + const __m128i G1 = _mm_mulhi_epu16(*V0, k13320); + const __m128i G2 = _mm_add_epi16(Y1, k8708); + const __m128i G3 = _mm_add_epi16(G0, G1); + const __m128i G4 = _mm_sub_epi16(G2, G3); + + // be careful with the saturated *unsigned* arithmetic here! + const __m128i B0 = _mm_mulhi_epu16(*U0, k33050); + const __m128i B1 = _mm_adds_epu16(B0, Y1); + const __m128i B2 = _mm_subs_epu16(B1, k17685); + + // use logical shift for B2, which can be larger than 32767 + *R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815] + *G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710] + *B = _mm_srli_epi16(B2, 6); // range: [0, 34238] +} + +// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. +static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) { + const __m128i zero = _mm_setzero_si128(); + return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); +} + +// Load and replicate the U/V samples +static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) { + const __m128i zero = _mm_setzero_si128(); + const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); + return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples +} + +// Convert 32 samples of YUV444 to R/G/B +static void YUV444ToRGB_SSE41(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, + __m128i* const B) { + const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u), + V0 = Load_HI_16_SSE41(v); + ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B); +} + +// Convert 32 samples of YUV420 to R/G/B +static void YUV420ToRGB_SSE41(const uint8_t* const y, + const uint8_t* const u, + const uint8_t* const v, + __m128i* const R, __m128i* const G, + __m128i* const B) { + const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u), + V0 = Load_UV_HI_8_SSE41(v); + ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B); +} + +// Pack the planar buffers +// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... +// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... +static WEBP_INLINE void PlanarTo24b_SSE41( + __m128i* const in0, __m128i* const in1, __m128i* const in2, + __m128i* const in3, __m128i* const in4, __m128i* const in5, + uint8_t* const rgb) { + // The input is 6 registers of sixteen 8b but for the sake of explanation, + // let's take 6 registers of four 8b values. + // To pack, we will keep taking one every two 8b integer and move it + // around as follows: + // Input: + // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 + // Split the 6 registers in two sets of 3 registers: the first set as the even + // 8b bytes, the second the odd ones: + // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 + // Repeat the same permutations twice more: + // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 + // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 + VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5); + + _mm_storeu_si128((__m128i*)(rgb + 0), *in0); + _mm_storeu_si128((__m128i*)(rgb + 16), *in1); + _mm_storeu_si128((__m128i*)(rgb + 32), *in2); + _mm_storeu_si128((__m128i*)(rgb + 48), *in3); + _mm_storeu_si128((__m128i*)(rgb + 64), *in4); + _mm_storeu_si128((__m128i*)(rgb + 80), *in5); +} + +void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; + + YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3); + + // Cast to 8b and store as RRRRGGGGBBBB. + rgb0 = _mm_packus_epi16(R0, R1); + rgb1 = _mm_packus_epi16(R2, R3); + rgb2 = _mm_packus_epi16(G0, G1); + rgb3 = _mm_packus_epi16(G2, G3); + rgb4 = _mm_packus_epi16(B0, B1); + rgb5 = _mm_packus_epi16(B2, B3); + + // Pack as RGBRGBRGBRGB. + PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); +} + +void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v, + uint8_t* dst) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; + + YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1); + YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2); + YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3); + + // Cast to 8b and store as BBBBGGGGRRRR. + bgr0 = _mm_packus_epi16(B0, B1); + bgr1 = _mm_packus_epi16(B2, B3); + bgr2 = _mm_packus_epi16(G0, G1); + bgr3 = _mm_packus_epi16(G2, G3); + bgr4 = _mm_packus_epi16(R0, R1); + bgr5= _mm_packus_epi16(R2, R3); + + // Pack as BGRBGRBGRBGR. + PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); +} + +//----------------------------------------------------------------------------- +// Arbitrary-length row conversion functions + +static void YuvToRgbRow_SSE41(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { + int n; + for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; + + YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3); + + // Cast to 8b and store as RRRRGGGGBBBB. + rgb0 = _mm_packus_epi16(R0, R1); + rgb1 = _mm_packus_epi16(R2, R3); + rgb2 = _mm_packus_epi16(G0, G1); + rgb3 = _mm_packus_epi16(G2, G3); + rgb4 = _mm_packus_epi16(B0, B1); + rgb5 = _mm_packus_epi16(B2, B3); + + // Pack as RGBRGBRGBRGB. + PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); + + y += 32; + u += 16; + v += 16; + } + for (; n < len; ++n) { // Finish off + VP8YuvToRgb(y[0], u[0], v[0], dst); + dst += 3; + y += 1; + u += (n & 1); + v += (n & 1); + } +} + +static void YuvToBgrRow_SSE41(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len) { + int n; + for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { + __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; + __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; + + YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0); + YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1); + YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2); + YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3); + + // Cast to 8b and store as BBBBGGGGRRRR. + bgr0 = _mm_packus_epi16(B0, B1); + bgr1 = _mm_packus_epi16(B2, B3); + bgr2 = _mm_packus_epi16(G0, G1); + bgr3 = _mm_packus_epi16(G2, G3); + bgr4 = _mm_packus_epi16(R0, R1); + bgr5 = _mm_packus_epi16(R2, R3); + + // Pack as BGRBGRBGRBGR. + PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); + + y += 32; + u += 16; + v += 16; + } + for (; n < len; ++n) { // Finish off + VP8YuvToBgr(y[0], u[0], v[0], dst); + dst += 3; + y += 1; + u += (n & 1); + v += (n & 1); + } +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void WebPInitSamplersSSE41(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) { + WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41; + WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41; +} + +//------------------------------------------------------------------------------ +// RGB24/32 -> YUV converters + +// Load eight 16b-words from *src. +#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src)) +// Store either 16b-words into *dst +#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V)) + +#define WEBP_SSE41_SHUFF(OUT) do { \ + const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \ + const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \ + const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \ + const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \ + const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \ + const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \ + \ + /* OR everything to get one channel */ \ + const __m128i tmp6 = _mm_or_si128(tmp0, tmp1); \ + const __m128i tmp7 = _mm_or_si128(tmp3, tmp4); \ + out[OUT + 0] = _mm_or_si128(tmp6, tmp2); \ + out[OUT + 1] = _mm_or_si128(tmp7, tmp5); \ +} while (0); + +// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: +// rrrr... rrrr... gggg... gggg... bbbb... bbbb.... +// Similar to PlanarTo24bHelper(), but in reverse order. +static WEBP_INLINE void RGB24PackedToPlanar_SSE41( + const uint8_t* const rgb, __m128i* const out /*out[6]*/) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0)); + const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16)); + const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32)); + const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48)); + const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64)); + const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80)); + + // Compute RR. + { + const __m128i shuff0 = _mm_set_epi8( + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0); + const __m128i shuff1 = _mm_set_epi8( + -1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1); + const __m128i shuff2 = _mm_set_epi8( + 13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + WEBP_SSE41_SHUFF(0) + } + // Compute GG. + { + const __m128i shuff0 = _mm_set_epi8( + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1); + const __m128i shuff1 = _mm_set_epi8( + -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1); + const __m128i shuff2 = _mm_set_epi8( + 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + WEBP_SSE41_SHUFF(2) + } + // Compute BB. + { + const __m128i shuff0 = _mm_set_epi8( + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2); + const __m128i shuff1 = _mm_set_epi8( + -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1); + const __m128i shuff2 = _mm_set_epi8( + 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + WEBP_SSE41_SHUFF(4) + } +} + +#undef WEBP_SSE41_SHUFF + +// Convert 8 packed ARGB to r[], g[], b[] +static WEBP_INLINE void RGB32PackedToPlanar_SSE41( + const uint32_t* const argb, __m128i* const rgb /*in[6]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i a0 = LOAD_16(argb + 0); + __m128i a1 = LOAD_16(argb + 4); + __m128i a2 = LOAD_16(argb + 8); + __m128i a3 = LOAD_16(argb + 12); + VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3); + rgb[0] = _mm_unpacklo_epi8(a1, zero); + rgb[1] = _mm_unpackhi_epi8(a1, zero); + rgb[2] = _mm_unpacklo_epi8(a2, zero); + rgb[3] = _mm_unpackhi_epi8(a2, zero); + rgb[4] = _mm_unpacklo_epi8(a3, zero); + rgb[5] = _mm_unpackhi_epi8(a3, zero); +} + +// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX +// It's a macro and not a function because we need to use immediate values with +// srai_epi32, e.g. +#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \ + ROUNDER, DESCALE_FIX, OUT) do { \ + const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \ + const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \ + const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \ + const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \ + const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \ + const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \ + const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \ + const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \ + const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \ + const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \ + (OUT) = _mm_packs_epi32(V5_lo, V5_hi); \ +} while (0) + +#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) +static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const Y) { + const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); + const __m128i kGB_y = MK_CST_16(16384, 6420); + const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); + + const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G); + const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G); + const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B); + const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B); + TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y); +} + +static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R, + const __m128i* const G, + const __m128i* const B, + __m128i* const U, + __m128i* const V) { + const __m128i kRG_u = MK_CST_16(-9719, -19081); + const __m128i kGB_u = MK_CST_16(0, 28800); + const __m128i kRG_v = MK_CST_16(28800, 0); + const __m128i kGB_v = MK_CST_16(-24116, -4684); + const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2); + + const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G); + const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G); + const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B); + const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B); + TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u, + kHALF_UV, YUV_FIX + 2, *U); + TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v, + kHALF_UV, YUV_FIX + 2, *V); +} + +#undef MK_CST_16 +#undef TRANSFORM + +static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) { + const int max_width = width & ~31; + int i; + for (i = 0; i < max_width; rgb += 3 * 16 * 2) { + __m128i rgb_plane[6]; + int j; + + RGB24PackedToPlanar_SSE41(rgb, rgb_plane); + + for (j = 0; j < 2; ++j, i += 16) { + const __m128i zero = _mm_setzero_si128(); + __m128i r, g, b, Y0, Y1; + + // Convert to 16-bit Y. + r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); + g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); + b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); + ConvertRGBToY_SSE41(&r, &g, &b, &Y0); + + // Convert to 16-bit Y. + r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); + g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); + b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); + ConvertRGBToY_SSE41(&r, &g, &b, &Y1); + + // Cast to 8-bit and store. + STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + } + } + for (; i < width; ++i, rgb += 3) { // left-over + y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); + } +} + +static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) { + const int max_width = width & ~31; + int i; + for (i = 0; i < max_width; bgr += 3 * 16 * 2) { + __m128i bgr_plane[6]; + int j; + + RGB24PackedToPlanar_SSE41(bgr, bgr_plane); + + for (j = 0; j < 2; ++j, i += 16) { + const __m128i zero = _mm_setzero_si128(); + __m128i r, g, b, Y0, Y1; + + // Convert to 16-bit Y. + b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); + g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); + r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); + ConvertRGBToY_SSE41(&r, &g, &b, &Y0); + + // Convert to 16-bit Y. + b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); + g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); + r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); + ConvertRGBToY_SSE41(&r, &g, &b, &Y1); + + // Cast to 8-bit and store. + STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + } + } + for (; i < width; ++i, bgr += 3) { // left-over + y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); + } +} + +static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) { + const int max_width = width & ~15; + int i; + for (i = 0; i < max_width; i += 16) { + __m128i Y0, Y1, rgb[6]; + RGB32PackedToPlanar_SSE41(&argb[i], rgb); + ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0); + ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1); + STORE_16(_mm_packus_epi16(Y0, Y1), y + i); + } + for (; i < width; ++i) { // left-over + const uint32_t p = argb[i]; + y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, + YUV_HALF); + } +} + +// Horizontal add (doubled) of two 16b values, result is 16b. +// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... +static void HorizontalAddPack_SSE41(const __m128i* const A, + const __m128i* const B, + __m128i* const out) { + const __m128i k2 = _mm_set1_epi16(2); + const __m128i C = _mm_madd_epi16(*A, k2); + const __m128i D = _mm_madd_epi16(*B, k2); + *out = _mm_packs_epi32(C, D); +} + +static void ConvertARGBToUV_SSE41(const uint32_t* argb, + uint8_t* u, uint8_t* v, + int src_width, int do_store) { + const int max_width = src_width & ~31; + int i; + for (i = 0; i < max_width; i += 32, u += 16, v += 16) { + __m128i rgb[6], U0, V0, U1, V1; + RGB32PackedToPlanar_SSE41(&argb[i], rgb); + HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]); + HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]); + HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]); + ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); + + RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb); + HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]); + HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]); + HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]); + ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); + + U0 = _mm_packus_epi16(U0, U1); + V0 = _mm_packus_epi16(V0, V1); + if (!do_store) { + const __m128i prev_u = LOAD_16(u); + const __m128i prev_v = LOAD_16(v); + U0 = _mm_avg_epu8(U0, prev_u); + V0 = _mm_avg_epu8(V0, prev_v); + } + STORE_16(U0, u); + STORE_16(V0, v); + } + if (i < src_width) { // left-over + WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); + } +} + +// Convert 16 packed ARGB 16b-values to r[], g[], b[] +static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( + const uint16_t* const rgbx, + __m128i* const r, __m128i* const g, __m128i* const b) { + const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x + const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x + const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ... + const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ... + // aarrggbb as 16-bit. + const __m128i shuff0 = + _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + const __m128i shuff1 = + _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0); + const __m128i A0 = _mm_shuffle_epi8(in0, shuff0); + const __m128i A1 = _mm_shuffle_epi8(in1, shuff1); + const __m128i A2 = _mm_shuffle_epi8(in2, shuff0); + const __m128i A3 = _mm_shuffle_epi8(in3, shuff1); + // R0R1G0G1 + // B0B1**** + // R2R3G2G3 + // B2B3**** + // (OR is used to free port 5 for the unpack) + const __m128i B0 = _mm_unpacklo_epi32(A0, A1); + const __m128i B1 = _mm_or_si128(A0, A1); + const __m128i B2 = _mm_unpacklo_epi32(A2, A3); + const __m128i B3 = _mm_or_si128(A2, A3); + // Gather the channels. + *r = _mm_unpacklo_epi64(B0, B2); + *g = _mm_unpackhi_epi64(B0, B2); + *b = _mm_unpackhi_epi64(B1, B3); +} + +static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb, + uint8_t* u, uint8_t* v, int width) { + const int max_width = width & ~15; + const uint16_t* const last_rgb = rgb + 4 * max_width; + while (rgb < last_rgb) { + __m128i r, g, b, U0, V0, U1, V1; + RGBA32PackedToPlanar_16b_SSE41(rgb + 0, &r, &g, &b); + ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0); + RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b); + ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1); + STORE_16(_mm_packus_epi16(U0, U1), u); + STORE_16(_mm_packus_epi16(V0, V1), v); + u += 16; + v += 16; + rgb += 2 * 32; + } + if (max_width < width) { // left-over + WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width); + } +} + +//------------------------------------------------------------------------------ + +extern void WebPInitConvertARGBToYUVSSE41(void); + +WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) { + WebPConvertARGBToY = ConvertARGBToY_SSE41; + WebPConvertARGBToUV = ConvertARGBToUV_SSE41; + + WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41; + WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41; + + WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41; +} + +//------------------------------------------------------------------------------ + +#else // !WEBP_USE_SSE41 + +WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41) +WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41) + +#endif // WEBP_USE_SSE41 diff --git a/src/3rdparty/libwebp/src/enc/alpha_enc.c b/src/3rdparty/libwebp/src/enc/alpha_enc.c index 5a2c931..dce9ca9 100644 --- a/src/3rdparty/libwebp/src/enc/alpha_enc.c +++ b/src/3rdparty/libwebp/src/enc/alpha_enc.c @@ -14,12 +14,12 @@ #include <assert.h> #include <stdlib.h> -#include "./vp8i_enc.h" -#include "../dsp/dsp.h" -#include "../utils/filters_utils.h" -#include "../utils/quant_levels_utils.h" -#include "../utils/utils.h" -#include "../webp/format_constants.h" +#include "src/enc/vp8i_enc.h" +#include "src/dsp/dsp.h" +#include "src/utils/filters_utils.h" +#include "src/utils/quant_levels_utils.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" // ----------------------------------------------------------------------------- // Encodes the given alpha data via specified compression method 'method'. @@ -44,11 +44,11 @@ // invalid quality or method, or // memory allocation for the compressed data fails. -#include "../enc/vp8li_enc.h" +#include "src/enc/vp8li_enc.h" static int EncodeLossless(const uint8_t* const data, int width, int height, int effort_level, // in [0..6] range - VP8LBitWriter* const bw, + int use_quality_100, VP8LBitWriter* const bw, WebPAuxStats* const stats) { int ok = 0; WebPConfig config; @@ -76,7 +76,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height, // Set a low default quality for encoding alpha. Ensure that Alpha quality at // lower methods (3 and below) is less than the threshold for triggering // costly 'BackwardReferencesTraceBackwards'. - config.quality = 8.f * effort_level; + // If the alpha quality is set to 100 and the method to 6, allow for a high + // lossless quality to trigger the cruncher. + config.quality = + (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level; assert(config.quality >= 0 && config.quality <= 100.f); // TODO(urvang): Temporary fix to avoid generating images that trigger @@ -134,7 +137,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height, if (method != ALPHA_NO_COMPRESSION) { ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3); ok = ok && EncodeLossless(alpha_src, width, height, effort_level, - &tmp_bw, &result->stats); + !reduce_levels, &tmp_bw, &result->stats); if (ok) { output = VP8LBitWriterFinish(&tmp_bw); output_size = VP8LBitWriterNumBytes(&tmp_bw); @@ -264,6 +267,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height, reduce_levels, effort_level, NULL, &best); } if (ok) { +#if !defined(WEBP_DISABLE_STATS) if (stats != NULL) { stats->lossless_features = best.stats.lossless_features; stats->histogram_bits = best.stats.histogram_bits; @@ -274,6 +278,9 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height, stats->lossless_hdr_size = best.stats.lossless_hdr_size; stats->lossless_data_size = best.stats.lossless_data_size; } +#else + (void)stats; +#endif *output_size = VP8BitWriterSize(&best.bw); *output = VP8BitWriterBuf(&best.bw); } else { @@ -339,10 +346,12 @@ static int EncodeAlpha(VP8Encoder* const enc, ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method, filter, reduce_levels, effort_level, output, output_size, pic->stats); +#if !defined(WEBP_DISABLE_STATS) if (pic->stats != NULL) { // need stats? pic->stats->coded_size += (int)(*output_size); enc->sse_[3] = sse; } +#endif } WebPSafeFree(quant_alpha); @@ -352,7 +361,8 @@ static int EncodeAlpha(VP8Encoder* const enc, //------------------------------------------------------------------------------ // Main calls -static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) { +static int CompressAlphaJob(void* arg1, void* dummy) { + VP8Encoder* const enc = (VP8Encoder*)arg1; const WebPConfig* config = enc->config_; uint8_t* alpha_data = NULL; size_t alpha_size = 0; @@ -385,7 +395,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) { WebPGetWorkerInterface()->Init(worker); worker->data1 = enc; worker->data2 = NULL; - worker->hook = (WebPWorkerHook)CompressAlphaJob; + worker->hook = CompressAlphaJob; } } diff --git a/src/3rdparty/libwebp/src/enc/analysis_enc.c b/src/3rdparty/libwebp/src/enc/analysis_enc.c index dce159b..687757a 100644 --- a/src/3rdparty/libwebp/src/enc/analysis_enc.c +++ b/src/3rdparty/libwebp/src/enc/analysis_enc.c @@ -15,9 +15,9 @@ #include <string.h> #include <assert.h> -#include "./vp8i_enc.h" -#include "./cost_enc.h" -#include "../utils/utils.h" +#include "src/enc/vp8i_enc.h" +#include "src/enc/cost_enc.h" +#include "src/utils/utils.h" #define MAX_ITERS_K_MEANS 6 @@ -434,7 +434,9 @@ typedef struct { } SegmentJob; // main work call -static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) { +static int DoSegmentsJob(void* arg1, void* arg2) { + SegmentJob* const job = (SegmentJob*)arg1; + VP8EncIterator* const it = (VP8EncIterator*)arg2; int ok = 1; if (!VP8IteratorIsDone(it)) { uint8_t tmp[32 + WEBP_ALIGN_CST]; @@ -456,13 +458,13 @@ static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) { dst->uv_alpha += src->uv_alpha; } -// initialize the job struct with some TODOs +// initialize the job struct with some tasks to perform static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job, int start_row, int end_row) { WebPGetWorkerInterface()->Init(&job->worker); job->worker.data1 = job; job->worker.data2 = &job->it; - job->worker.hook = (WebPWorkerHook)DoSegmentsJob; + job->worker.hook = DoSegmentsJob; VP8IteratorInit(enc, &job->it); VP8IteratorSetRow(&job->it, start_row); VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_); diff --git a/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c b/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c new file mode 100644 index 0000000..516abd7 --- /dev/null +++ b/src/3rdparty/libwebp/src/enc/backward_references_cost_enc.c @@ -0,0 +1,790 @@ +// Copyright 2017 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Improves a given set of backward references by analyzing its bit cost. +// The algorithm is similar to the Zopfli compression algorithm but tailored to +// images. +// +// Author: Vincent Rabaud (vrabaud@google.com) +// + +#include <assert.h> + +#include "src/enc/backward_references_enc.h" +#include "src/enc/histogram_enc.h" +#include "src/dsp/lossless_common.h" +#include "src/utils/color_cache_utils.h" +#include "src/utils/utils.h" + +#define VALUES_IN_BYTE 256 + +extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs); +extern int VP8LDistanceToPlaneCode(int xsize, int dist); +extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs, + const PixOrCopy v); + +typedef struct { + double alpha_[VALUES_IN_BYTE]; + double red_[VALUES_IN_BYTE]; + double blue_[VALUES_IN_BYTE]; + double distance_[NUM_DISTANCE_CODES]; + double* literal_; +} CostModel; + +static void ConvertPopulationCountTableToBitEstimates( + int num_symbols, const uint32_t population_counts[], double output[]) { + uint32_t sum = 0; + int nonzeros = 0; + int i; + for (i = 0; i < num_symbols; ++i) { + sum += population_counts[i]; + if (population_counts[i] > 0) { + ++nonzeros; + } + } + if (nonzeros <= 1) { + memset(output, 0, num_symbols * sizeof(*output)); + } else { + const double logsum = VP8LFastLog2(sum); + for (i = 0; i < num_symbols; ++i) { + output[i] = logsum - VP8LFastLog2(population_counts[i]); + } + } +} + +static int CostModelBuild(CostModel* const m, int xsize, int cache_bits, + const VP8LBackwardRefs* const refs) { + int ok = 0; + VP8LRefsCursor c = VP8LRefsCursorInit(refs); + VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits); + if (histo == NULL) goto Error; + + // The following code is similar to VP8LHistogramCreate but converts the + // distance to plane code. + VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1); + while (VP8LRefsCursorOk(&c)) { + VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode, + xsize); + VP8LRefsCursorNext(&c); + } + + ConvertPopulationCountTableToBitEstimates( + VP8LHistogramNumCodes(histo->palette_code_bits_), + histo->literal_, m->literal_); + ConvertPopulationCountTableToBitEstimates( + VALUES_IN_BYTE, histo->red_, m->red_); + ConvertPopulationCountTableToBitEstimates( + VALUES_IN_BYTE, histo->blue_, m->blue_); + ConvertPopulationCountTableToBitEstimates( + VALUES_IN_BYTE, histo->alpha_, m->alpha_); + ConvertPopulationCountTableToBitEstimates( + NUM_DISTANCE_CODES, histo->distance_, m->distance_); + ok = 1; + + Error: + VP8LFreeHistogram(histo); + return ok; +} + +static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) { + return m->alpha_[v >> 24] + + m->red_[(v >> 16) & 0xff] + + m->literal_[(v >> 8) & 0xff] + + m->blue_[v & 0xff]; +} + +static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) { + const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx; + return m->literal_[literal_idx]; +} + +static WEBP_INLINE double GetLengthCost(const CostModel* const m, + uint32_t length) { + int code, extra_bits; + VP8LPrefixEncodeBits(length, &code, &extra_bits); + return m->literal_[VALUES_IN_BYTE + code] + extra_bits; +} + +static WEBP_INLINE double GetDistanceCost(const CostModel* const m, + uint32_t distance) { + int code, extra_bits; + VP8LPrefixEncodeBits(distance, &code, &extra_bits); + return m->distance_[code] + extra_bits; +} + +static WEBP_INLINE void AddSingleLiteralWithCostModel( + const uint32_t* const argb, VP8LColorCache* const hashers, + const CostModel* const cost_model, int idx, int use_color_cache, + float prev_cost, float* const cost, uint16_t* const dist_array) { + double cost_val = prev_cost; + const uint32_t color = argb[idx]; + const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1; + if (ix >= 0) { + // use_color_cache is true and hashers contains color + const double mul0 = 0.68; + cost_val += GetCacheCost(cost_model, ix) * mul0; + } else { + const double mul1 = 0.82; + if (use_color_cache) VP8LColorCacheInsert(hashers, color); + cost_val += GetLiteralCost(cost_model, color) * mul1; + } + if (cost[idx] > cost_val) { + cost[idx] = (float)cost_val; + dist_array[idx] = 1; // only one is inserted. + } +} + +// ----------------------------------------------------------------------------- +// CostManager and interval handling + +// Empirical value to avoid high memory consumption but good for performance. +#define COST_CACHE_INTERVAL_SIZE_MAX 500 + +// To perform backward reference every pixel at index index_ is considered and +// the cost for the MAX_LENGTH following pixels computed. Those following pixels +// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of: +// cost_ = distance cost at index + GetLengthCost(cost_model, k) +// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an +// array of size MAX_LENGTH. +// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the +// minimal values using intervals of constant cost. +// An interval is defined by the index_ of the pixel that generated it and +// is only useful in a range of indices from start_ to end_ (exclusive), i.e. +// it contains the minimum value for pixels between start_ and end_. +// Intervals are stored in a linked list and ordered by start_. When a new +// interval has a better value, old intervals are split or removed. There are +// therefore no overlapping intervals. +typedef struct CostInterval CostInterval; +struct CostInterval { + float cost_; + int start_; + int end_; + int index_; + CostInterval* previous_; + CostInterval* next_; +}; + +// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval. +typedef struct { + double cost_; + int start_; + int end_; // Exclusive. +} CostCacheInterval; + +// This structure is in charge of managing intervals and costs. +// It caches the different CostCacheInterval, caches the different +// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose +// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX). +#define COST_MANAGER_MAX_FREE_LIST 10 +typedef struct { + CostInterval* head_; + int count_; // The number of stored intervals. + CostCacheInterval* cache_intervals_; + size_t cache_intervals_size_; + double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k). + float* costs_; + uint16_t* dist_array_; + // Most of the time, we only need few intervals -> use a free-list, to avoid + // fragmentation with small allocs in most common cases. + CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST]; + CostInterval* free_intervals_; + // These are regularly malloc'd remains. This list can't grow larger than than + // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note. + CostInterval* recycled_intervals_; +} CostManager; + +static void CostIntervalAddToFreeList(CostManager* const manager, + CostInterval* const interval) { + interval->next_ = manager->free_intervals_; + manager->free_intervals_ = interval; +} + +static int CostIntervalIsInFreeList(const CostManager* const manager, + const CostInterval* const interval) { + return (interval >= &manager->intervals_[0] && + interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]); +} + +static void CostManagerInitFreeList(CostManager* const manager) { + int i; + manager->free_intervals_ = NULL; + for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) { + CostIntervalAddToFreeList(manager, &manager->intervals_[i]); + } +} + +static void DeleteIntervalList(CostManager* const manager, + const CostInterval* interval) { + while (interval != NULL) { + const CostInterval* const next = interval->next_; + if (!CostIntervalIsInFreeList(manager, interval)) { + WebPSafeFree((void*)interval); + } // else: do nothing + interval = next; + } +} + +static void CostManagerClear(CostManager* const manager) { + if (manager == NULL) return; + + WebPSafeFree(manager->costs_); + WebPSafeFree(manager->cache_intervals_); + + // Clear the interval lists. + DeleteIntervalList(manager, manager->head_); + manager->head_ = NULL; + DeleteIntervalList(manager, manager->recycled_intervals_); + manager->recycled_intervals_ = NULL; + + // Reset pointers, count_ and cache_intervals_size_. + memset(manager, 0, sizeof(*manager)); + CostManagerInitFreeList(manager); +} + +static int CostManagerInit(CostManager* const manager, + uint16_t* const dist_array, int pix_count, + const CostModel* const cost_model) { + int i; + const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count; + + manager->costs_ = NULL; + manager->cache_intervals_ = NULL; + manager->head_ = NULL; + manager->recycled_intervals_ = NULL; + manager->count_ = 0; + manager->dist_array_ = dist_array; + CostManagerInitFreeList(manager); + + // Fill in the cost_cache_. + manager->cache_intervals_size_ = 1; + manager->cost_cache_[0] = GetLengthCost(cost_model, 0); + for (i = 1; i < cost_cache_size; ++i) { + manager->cost_cache_[i] = GetLengthCost(cost_model, i); + // Get the number of bound intervals. + if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) { + ++manager->cache_intervals_size_; + } + } + + // With the current cost model, we usually have below 20 intervals. + // The worst case scenario with a cost model would be if every length has a + // different cost, hence MAX_LENGTH but that is impossible with the current + // implementation that spirals around a pixel. + assert(manager->cache_intervals_size_ <= MAX_LENGTH); + manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc( + manager->cache_intervals_size_, sizeof(*manager->cache_intervals_)); + if (manager->cache_intervals_ == NULL) { + CostManagerClear(manager); + return 0; + } + + // Fill in the cache_intervals_. + { + CostCacheInterval* cur = manager->cache_intervals_; + + // Consecutive values in cost_cache_ are compared and if a big enough + // difference is found, a new interval is created and bounded. + cur->start_ = 0; + cur->end_ = 1; + cur->cost_ = manager->cost_cache_[0]; + for (i = 1; i < cost_cache_size; ++i) { + const double cost_val = manager->cost_cache_[i]; + if (cost_val != cur->cost_) { + ++cur; + // Initialize an interval. + cur->start_ = i; + cur->cost_ = cost_val; + } + cur->end_ = i + 1; + } + } + + manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_)); + if (manager->costs_ == NULL) { + CostManagerClear(manager); + return 0; + } + // Set the initial costs_ high for every pixel as we will keep the minimum. + for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f; + + return 1; +} + +// Given the cost and the position that define an interval, update the cost at +// pixel 'i' if it is smaller than the previously computed value. +static WEBP_INLINE void UpdateCost(CostManager* const manager, int i, + int position, float cost) { + const int k = i - position; + assert(k >= 0 && k < MAX_LENGTH); + + if (manager->costs_[i] > cost) { + manager->costs_[i] = cost; + manager->dist_array_[i] = k + 1; + } +} + +// Given the cost and the position that define an interval, update the cost for +// all the pixels between 'start' and 'end' excluded. +static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager, + int start, int end, int position, + float cost) { + int i; + for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost); +} + +// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'. +static WEBP_INLINE void ConnectIntervals(CostManager* const manager, + CostInterval* const prev, + CostInterval* const next) { + if (prev != NULL) { + prev->next_ = next; + } else { + manager->head_ = next; + } + + if (next != NULL) next->previous_ = prev; +} + +// Pop an interval in the manager. +static WEBP_INLINE void PopInterval(CostManager* const manager, + CostInterval* const interval) { + if (interval == NULL) return; + + ConnectIntervals(manager, interval->previous_, interval->next_); + if (CostIntervalIsInFreeList(manager, interval)) { + CostIntervalAddToFreeList(manager, interval); + } else { // recycle regularly malloc'd intervals too + interval->next_ = manager->recycled_intervals_; + manager->recycled_intervals_ = interval; + } + --manager->count_; + assert(manager->count_ >= 0); +} + +// Update the cost at index i by going over all the stored intervals that +// overlap with i. +// If 'do_clean_intervals' is set to something different than 0, intervals that +// end before 'i' will be popped. +static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i, + int do_clean_intervals) { + CostInterval* current = manager->head_; + + while (current != NULL && current->start_ <= i) { + CostInterval* const next = current->next_; + if (current->end_ <= i) { + if (do_clean_intervals) { + // We have an outdated interval, remove it. + PopInterval(manager, current); + } + } else { + UpdateCost(manager, i, current->index_, current->cost_); + } + current = next; + } +} + +// Given a current orphan interval and its previous interval, before +// it was orphaned (which can be NULL), set it at the right place in the list +// of intervals using the start_ ordering and the previous interval as a hint. +static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager, + CostInterval* const current, + CostInterval* previous) { + assert(current != NULL); + + if (previous == NULL) previous = manager->head_; + while (previous != NULL && current->start_ < previous->start_) { + previous = previous->previous_; + } + while (previous != NULL && previous->next_ != NULL && + previous->next_->start_ < current->start_) { + previous = previous->next_; + } + + if (previous != NULL) { + ConnectIntervals(manager, current, previous->next_); + } else { + ConnectIntervals(manager, current, manager->head_); + } + ConnectIntervals(manager, previous, current); +} + +// Insert an interval in the list contained in the manager by starting at +// interval_in as a hint. The intervals are sorted by start_ value. +static WEBP_INLINE void InsertInterval(CostManager* const manager, + CostInterval* const interval_in, + float cost, int position, int start, + int end) { + CostInterval* interval_new; + + if (start >= end) return; + if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) { + // Serialize the interval if we cannot store it. + UpdateCostPerInterval(manager, start, end, position, cost); + return; + } + if (manager->free_intervals_ != NULL) { + interval_new = manager->free_intervals_; + manager->free_intervals_ = interval_new->next_; + } else if (manager->recycled_intervals_ != NULL) { + interval_new = manager->recycled_intervals_; + manager->recycled_intervals_ = interval_new->next_; + } else { // malloc for good + interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new)); + if (interval_new == NULL) { + // Write down the interval if we cannot create it. + UpdateCostPerInterval(manager, start, end, position, cost); + return; + } + } + + interval_new->cost_ = cost; + interval_new->index_ = position; + interval_new->start_ = start; + interval_new->end_ = end; + PositionOrphanInterval(manager, interval_new, interval_in); + + ++manager->count_; +} + +// Given a new cost interval defined by its start at position, its length value +// and distance_cost, add its contributions to the previous intervals and costs. +// If handling the interval or one of its subintervals becomes to heavy, its +// contribution is added to the costs right away. +static WEBP_INLINE void PushInterval(CostManager* const manager, + double distance_cost, int position, + int len) { + size_t i; + CostInterval* interval = manager->head_; + CostInterval* interval_next; + const CostCacheInterval* const cost_cache_intervals = + manager->cache_intervals_; + // If the interval is small enough, no need to deal with the heavy + // interval logic, just serialize it right away. This constant is empirical. + const int kSkipDistance = 10; + + if (len < kSkipDistance) { + int j; + for (j = position; j < position + len; ++j) { + const int k = j - position; + float cost_tmp; + assert(k >= 0 && k < MAX_LENGTH); + cost_tmp = (float)(distance_cost + manager->cost_cache_[k]); + + if (manager->costs_[j] > cost_tmp) { + manager->costs_[j] = cost_tmp; + manager->dist_array_[j] = k + 1; + } + } + return; + } + + for (i = 0; i < manager->cache_intervals_size_ && + cost_cache_intervals[i].start_ < len; + ++i) { + // Define the intersection of the ith interval with the new one. + int start = position + cost_cache_intervals[i].start_; + const int end = position + (cost_cache_intervals[i].end_ > len + ? len + : cost_cache_intervals[i].end_); + const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_); + + for (; interval != NULL && interval->start_ < end; + interval = interval_next) { + interval_next = interval->next_; + + // Make sure we have some overlap + if (start >= interval->end_) continue; + + if (cost >= interval->cost_) { + // When intervals are represented, the lower, the better. + // [**********************************************************[ + // start end + // [----------------------------------[ + // interval->start_ interval->end_ + // If we are worse than what we already have, add whatever we have so + // far up to interval. + const int start_new = interval->end_; + InsertInterval(manager, interval, cost, position, start, + interval->start_); + start = start_new; + if (start >= end) break; + continue; + } + + if (start <= interval->start_) { + if (interval->end_ <= end) { + // [----------------------------------[ + // interval->start_ interval->end_ + // [**************************************************************[ + // start end + // We can safely remove the old interval as it is fully included. + PopInterval(manager, interval); + } else { + // [------------------------------------[ + // interval->start_ interval->end_ + // [*****************************[ + // start end + interval->start_ = end; + break; + } + } else { + if (end < interval->end_) { + // [--------------------------------------------------------------[ + // interval->start_ interval->end_ + // [*****************************[ + // start end + // We have to split the old interval as it fully contains the new one. + const int end_original = interval->end_; + interval->end_ = start; + InsertInterval(manager, interval, interval->cost_, interval->index_, + end, end_original); + interval = interval->next_; + break; + } else { + // [------------------------------------[ + // interval->start_ interval->end_ + // [*****************************[ + // start end + interval->end_ = start; + } + } + } + // Insert the remaining interval from start to end. + InsertInterval(manager, interval, cost, position, start, end); + } +} + +static int BackwardReferencesHashChainDistanceOnly( + int xsize, int ysize, const uint32_t* const argb, int cache_bits, + const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs, + uint16_t* const dist_array) { + int i; + int ok = 0; + int cc_init = 0; + const int pix_count = xsize * ysize; + const int use_color_cache = (cache_bits > 0); + const size_t literal_array_size = + sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES + + ((cache_bits > 0) ? (1 << cache_bits) : 0)); + const size_t cost_model_size = sizeof(CostModel) + literal_array_size; + CostModel* const cost_model = + (CostModel*)WebPSafeCalloc(1ULL, cost_model_size); + VP8LColorCache hashers; + CostManager* cost_manager = + (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager)); + int offset_prev = -1, len_prev = -1; + double offset_cost = -1; + int first_offset_is_constant = -1; // initialized with 'impossible' value + int reach = 0; + + if (cost_model == NULL || cost_manager == NULL) goto Error; + + cost_model->literal_ = (double*)(cost_model + 1); + if (use_color_cache) { + cc_init = VP8LColorCacheInit(&hashers, cache_bits); + if (!cc_init) goto Error; + } + + if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) { + goto Error; + } + + if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) { + goto Error; + } + + // We loop one pixel at a time, but store all currently best points to + // non-processed locations from this point. + dist_array[0] = 0; + // Add first pixel as literal. + AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache, + 0.f, cost_manager->costs_, dist_array); + + for (i = 1; i < pix_count; ++i) { + const float prev_cost = cost_manager->costs_[i - 1]; + int offset, len; + VP8LHashChainFindCopy(hash_chain, i, &offset, &len); + + // Try adding the pixel as a literal. + AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i, + use_color_cache, prev_cost, + cost_manager->costs_, dist_array); + + // If we are dealing with a non-literal. + if (len >= 2) { + if (offset != offset_prev) { + const int code = VP8LDistanceToPlaneCode(xsize, offset); + offset_cost = GetDistanceCost(cost_model, code); + first_offset_is_constant = 1; + PushInterval(cost_manager, prev_cost + offset_cost, i, len); + } else { + assert(offset_cost >= 0); + assert(len_prev >= 0); + assert(first_offset_is_constant == 0 || first_offset_is_constant == 1); + // Instead of considering all contributions from a pixel i by calling: + // PushInterval(cost_manager, prev_cost + offset_cost, i, len); + // we optimize these contributions in case offset_cost stays the same + // for consecutive pixels. This describes a set of pixels similar to a + // previous set (e.g. constant color regions). + if (first_offset_is_constant) { + reach = i - 1 + len_prev - 1; + first_offset_is_constant = 0; + } + + if (i + len - 1 > reach) { + // We can only be go further with the same offset if the previous + // length was maxed, hence len_prev == len == MAX_LENGTH. + // TODO(vrabaud), bump i to the end right away (insert cache and + // update cost). + // TODO(vrabaud), check if one of the points in between does not have + // a lower cost. + // Already consider the pixel at "reach" to add intervals that are + // better than whatever we add. + int offset_j, len_j = 0; + int j; + assert(len == MAX_LENGTH || len == pix_count - i); + // Figure out the last consecutive pixel within [i, reach + 1] with + // the same offset. + for (j = i; j <= reach; ++j) { + VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j); + if (offset_j != offset) { + VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j); + break; + } + } + // Update the cost at j - 1 and j. + UpdateCostAtIndex(cost_manager, j - 1, 0); + UpdateCostAtIndex(cost_manager, j, 0); + + PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost, + j, len_j); + reach = j + len_j - 1; + } + } + } + + UpdateCostAtIndex(cost_manager, i, 1); + offset_prev = offset; + len_prev = len; + } + + ok = !refs->error_; +Error: + if (cc_init) VP8LColorCacheClear(&hashers); + CostManagerClear(cost_manager); + WebPSafeFree(cost_model); + WebPSafeFree(cost_manager); + return ok; +} + +// We pack the path at the end of *dist_array and return +// a pointer to this part of the array. Example: +// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232] +static void TraceBackwards(uint16_t* const dist_array, + int dist_array_size, + uint16_t** const chosen_path, + int* const chosen_path_size) { + uint16_t* path = dist_array + dist_array_size; + uint16_t* cur = dist_array + dist_array_size - 1; + while (cur >= dist_array) { + const int k = *cur; + --path; + *path = k; + cur -= k; + } + *chosen_path = path; + *chosen_path_size = (int)(dist_array + dist_array_size - path); +} + +static int BackwardReferencesHashChainFollowChosenPath( + const uint32_t* const argb, int cache_bits, + const uint16_t* const chosen_path, int chosen_path_size, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) { + const int use_color_cache = (cache_bits > 0); + int ix; + int i = 0; + int ok = 0; + int cc_init = 0; + VP8LColorCache hashers; + + if (use_color_cache) { + cc_init = VP8LColorCacheInit(&hashers, cache_bits); + if (!cc_init) goto Error; + } + + VP8LClearBackwardRefs(refs); + for (ix = 0; ix < chosen_path_size; ++ix) { + const int len = chosen_path[ix]; + if (len != 1) { + int k; + const int offset = VP8LHashChainFindOffset(hash_chain, i); + VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); + if (use_color_cache) { + for (k = 0; k < len; ++k) { + VP8LColorCacheInsert(&hashers, argb[i + k]); + } + } + i += len; + } else { + PixOrCopy v; + const int idx = + use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1; + if (idx >= 0) { + // use_color_cache is true and hashers contains argb[i] + // push pixel as a color cache index + v = PixOrCopyCreateCacheIdx(idx); + } else { + if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]); + v = PixOrCopyCreateLiteral(argb[i]); + } + VP8LBackwardRefsCursorAdd(refs, v); + ++i; + } + } + ok = !refs->error_; + Error: + if (cc_init) VP8LColorCacheClear(&hashers); + return ok; +} + +// Returns 1 on success. +extern int VP8LBackwardReferencesTraceBackwards( + int xsize, int ysize, const uint32_t* const argb, int cache_bits, + const VP8LHashChain* const hash_chain, + const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst); +int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize, + const uint32_t* const argb, + int cache_bits, + const VP8LHashChain* const hash_chain, + const VP8LBackwardRefs* const refs_src, + VP8LBackwardRefs* const refs_dst) { + int ok = 0; + const int dist_array_size = xsize * ysize; + uint16_t* chosen_path = NULL; + int chosen_path_size = 0; + uint16_t* dist_array = + (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array)); + + if (dist_array == NULL) goto Error; + + if (!BackwardReferencesHashChainDistanceOnly( + xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) { + goto Error; + } + TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size); + if (!BackwardReferencesHashChainFollowChosenPath( + argb, cache_bits, chosen_path, chosen_path_size, hash_chain, + refs_dst)) { + goto Error; + } + ok = 1; + Error: + WebPSafeFree(dist_array); + return ok; +} diff --git a/src/3rdparty/libwebp/src/enc/backward_references_enc.c b/src/3rdparty/libwebp/src/enc/backward_references_enc.c index 7c0559f..d445b40 100644 --- a/src/3rdparty/libwebp/src/enc/backward_references_enc.c +++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.c @@ -13,35 +13,24 @@ #include <assert.h> #include <math.h> -#include "./backward_references_enc.h" -#include "./histogram_enc.h" -#include "../dsp/lossless.h" -#include "../dsp/lossless_common.h" -#include "../dsp/dsp.h" -#include "../utils/color_cache_utils.h" -#include "../utils/utils.h" - -#define VALUES_IN_BYTE 256 +#include "src/enc/backward_references_enc.h" +#include "src/enc/histogram_enc.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" +#include "src/dsp/dsp.h" +#include "src/utils/color_cache_utils.h" +#include "src/utils/utils.h" #define MIN_BLOCK_SIZE 256 // minimum block size for backward references #define MAX_ENTROPY (1e30f) // 1M window (4M bytes) minus 120 special codes for short distances. -#define WINDOW_SIZE_BITS 20 #define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120) // Minimum number of pixels for which it is cheaper to encode a // distance + length instead of each pixel as a literal. #define MIN_LENGTH 4 -// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it -// is used in VP8LHashChain. -#define MAX_LENGTH_BITS 12 -// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits. -#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1) -#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32 -#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32" -#endif // ----------------------------------------------------------------------------- @@ -56,7 +45,8 @@ static const uint8_t plane_to_code_lut[128] = { 119, 116, 111, 106, 97, 88, 84, 74, 72, 75, 85, 89, 98, 107, 112, 117 }; -static int DistanceToPlaneCode(int xsize, int dist) { +extern int VP8LDistanceToPlaneCode(int xsize, int dist); +int VP8LDistanceToPlaneCode(int xsize, int dist) { const int yoffset = dist / xsize; const int xoffset = dist - yoffset * xsize; if (xoffset <= 8 && yoffset < 8) { @@ -91,7 +81,8 @@ struct PixOrCopyBlock { int size_; // currently used size }; -static void ClearBackwardRefs(VP8LBackwardRefs* const refs) { +extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs); +void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) { assert(refs != NULL); if (refs->tail_ != NULL) { *refs->tail_ = refs->free_blocks_; // recycle all blocks at once @@ -104,7 +95,7 @@ static void ClearBackwardRefs(VP8LBackwardRefs* const refs) { void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) { assert(refs != NULL); - ClearBackwardRefs(refs); + VP8LClearBackwardRefs(refs); while (refs->free_blocks_ != NULL) { PixOrCopyBlock* const next = refs->free_blocks_->next_; WebPSafeFree(refs->free_blocks_); @@ -163,8 +154,10 @@ static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) { return b; } -static WEBP_INLINE void BackwardRefsCursorAdd(VP8LBackwardRefs* const refs, - const PixOrCopy v) { +extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs, + const PixOrCopy v); +void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs, + const PixOrCopy v) { PixOrCopyBlock* b = refs->last_block_; if (b == NULL || b->size_ == refs->block_size_) { b = BackwardRefsNewBlock(refs); @@ -173,21 +166,6 @@ static WEBP_INLINE void BackwardRefsCursorAdd(VP8LBackwardRefs* const refs, b->start_[b->size_++] = v; } -int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src, - VP8LBackwardRefs* const dst) { - const PixOrCopyBlock* b = src->refs_; - ClearBackwardRefs(dst); - assert(src->block_size_ == dst->block_size_); - while (b != NULL) { - PixOrCopyBlock* const new_b = BackwardRefsNewBlock(dst); - if (new_b == NULL) return 0; // dst->error_ is set - memcpy(new_b->start_, b->start_, b->size_ * sizeof(*b->start_)); - new_b->size_ = b->size_; - b = b->next_; - } - return 1; -} - // ----------------------------------------------------------------------------- // Hash chains @@ -213,13 +191,14 @@ void VP8LHashChainClear(VP8LHashChain* const p) { // ----------------------------------------------------------------------------- -#define HASH_MULTIPLIER_HI (0xc6a4a793ULL) -#define HASH_MULTIPLIER_LO (0x5bd1e996ULL) +static const uint32_t kHashMultiplierHi = 0xc6a4a793u; +static const uint32_t kHashMultiplierLo = 0x5bd1e996u; -static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) { +static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE +uint32_t GetPixPairHash64(const uint32_t* const argb) { uint32_t key; - key = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu; - key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu; + key = argb[1] * kHashMultiplierHi; + key += argb[0] * kHashMultiplierLo; key = key >> (32 - HASH_BITS); return key; } @@ -411,24 +390,6 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality, return 1; } -static WEBP_INLINE int HashChainFindOffset(const VP8LHashChain* const p, - const int base_position) { - return p->offset_length_[base_position] >> MAX_LENGTH_BITS; -} - -static WEBP_INLINE int HashChainFindLength(const VP8LHashChain* const p, - const int base_position) { - return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1); -} - -static WEBP_INLINE void HashChainFindCopy(const VP8LHashChain* const p, - int base_position, - int* const offset_ptr, - int* const length_ptr) { - *offset_ptr = HashChainFindOffset(p, base_position); - *length_ptr = HashChainFindLength(p, base_position); -} - static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache, VP8LColorCache* const hashers, VP8LBackwardRefs* const refs) { @@ -444,7 +405,7 @@ static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache, } else { v = PixOrCopyCreateLiteral(pixel); } - BackwardRefsCursorAdd(refs, v); + VP8LBackwardRefsCursorAdd(refs, v); } static int BackwardReferencesRle(int xsize, int ysize, @@ -458,7 +419,7 @@ static int BackwardReferencesRle(int xsize, int ysize, if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) { return 0; } - ClearBackwardRefs(refs); + VP8LClearBackwardRefs(refs); // Add first pixel as literal. AddSingleLiteral(argb[0], use_color_cache, &hashers, refs); i = 1; @@ -468,13 +429,13 @@ static int BackwardReferencesRle(int xsize, int ysize, const int prev_row_len = (i < xsize) ? 0 : FindMatchLength(argb + i, argb + i - xsize, 0, max_len); if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) { - BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len)); + VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len)); // We don't need to update the color cache here since it is always the // same pixel being copied, and that does not change the color cache // state. i += rle_len; } else if (prev_row_len >= MIN_LENGTH) { - BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len)); + VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len)); if (use_color_cache) { for (k = 0; k < prev_row_len; ++k) { VP8LColorCacheInsert(&hashers, argb[i + k]); @@ -506,17 +467,18 @@ static int BackwardReferencesLz77(int xsize, int ysize, cc_init = VP8LColorCacheInit(&hashers, cache_bits); if (!cc_init) goto Error; } - ClearBackwardRefs(refs); + VP8LClearBackwardRefs(refs); for (i = 0; i < pix_count;) { // Alternative#1: Code the pixels starting at 'i' using backward reference. int offset = 0; int len = 0; int j; - HashChainFindCopy(hash_chain, i, &offset, &len); + VP8LHashChainFindCopy(hash_chain, i, &offset, &len); if (len >= MIN_LENGTH) { const int len_ini = len; int max_reach = 0; - assert(i + len < pix_count); + const int j_max = + (i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini; // Only start from what we have not checked already. i_last_check = (i > i_last_check) ? i : i_last_check; // We know the best match for the current pixel but we try to find the @@ -525,13 +487,14 @@ static int BackwardReferencesLz77(int xsize, int ysize, // [i,i+len) + [i+len, length of best match at i+len) // while we check if we can use: // [i,j) (where j<=i+len) + [j, length of best match at j) - for (j = i_last_check + 1; j <= i + len_ini; ++j) { - const int len_j = HashChainFindLength(hash_chain, j); + for (j = i_last_check + 1; j <= j_max; ++j) { + const int len_j = VP8LHashChainFindLength(hash_chain, j); const int reach = j + (len_j >= MIN_LENGTH ? len_j : 1); // 1 for single literal. if (reach > max_reach) { len = j - i; max_reach = reach; + if (max_reach >= pix_count) break; } } } else { @@ -542,7 +505,7 @@ static int BackwardReferencesLz77(int xsize, int ysize, if (len == 1) { AddSingleLiteral(argb[i], use_color_cache, &hashers, refs); } else { - BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); + VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); if (use_color_cache) { for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]); } @@ -556,1051 +519,268 @@ static int BackwardReferencesLz77(int xsize, int ysize, return ok; } -// ----------------------------------------------------------------------------- - -typedef struct { - double alpha_[VALUES_IN_BYTE]; - double red_[VALUES_IN_BYTE]; - double blue_[VALUES_IN_BYTE]; - double distance_[NUM_DISTANCE_CODES]; - double* literal_; -} CostModel; - -static int BackwardReferencesTraceBackwards( - int xsize, int ysize, const uint32_t* const argb, int quality, - int cache_bits, const VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs); - -static void ConvertPopulationCountTableToBitEstimates( - int num_symbols, const uint32_t population_counts[], double output[]) { - uint32_t sum = 0; - int nonzeros = 0; - int i; - for (i = 0; i < num_symbols; ++i) { - sum += population_counts[i]; - if (population_counts[i] > 0) { - ++nonzeros; - } - } - if (nonzeros <= 1) { - memset(output, 0, num_symbols * sizeof(*output)); - } else { - const double logsum = VP8LFastLog2(sum); - for (i = 0; i < num_symbols; ++i) { - output[i] = logsum - VP8LFastLog2(population_counts[i]); - } - } -} - -static int CostModelBuild(CostModel* const m, int cache_bits, - VP8LBackwardRefs* const refs) { - int ok = 0; - VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits); - if (histo == NULL) goto Error; - - VP8LHistogramCreate(histo, refs, cache_bits); - - ConvertPopulationCountTableToBitEstimates( - VP8LHistogramNumCodes(histo->palette_code_bits_), - histo->literal_, m->literal_); - ConvertPopulationCountTableToBitEstimates( - VALUES_IN_BYTE, histo->red_, m->red_); - ConvertPopulationCountTableToBitEstimates( - VALUES_IN_BYTE, histo->blue_, m->blue_); - ConvertPopulationCountTableToBitEstimates( - VALUES_IN_BYTE, histo->alpha_, m->alpha_); - ConvertPopulationCountTableToBitEstimates( - NUM_DISTANCE_CODES, histo->distance_, m->distance_); - ok = 1; - - Error: - VP8LFreeHistogram(histo); - return ok; -} - -static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) { - return m->alpha_[v >> 24] + - m->red_[(v >> 16) & 0xff] + - m->literal_[(v >> 8) & 0xff] + - m->blue_[v & 0xff]; -} - -static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) { - const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx; - return m->literal_[literal_idx]; -} - -static WEBP_INLINE double GetLengthCost(const CostModel* const m, - uint32_t length) { - int code, extra_bits; - VP8LPrefixEncodeBits(length, &code, &extra_bits); - return m->literal_[VALUES_IN_BYTE + code] + extra_bits; -} - -static WEBP_INLINE double GetDistanceCost(const CostModel* const m, - uint32_t distance) { - int code, extra_bits; - VP8LPrefixEncodeBits(distance, &code, &extra_bits); - return m->distance_[code] + extra_bits; -} - -static void AddSingleLiteralWithCostModel(const uint32_t* const argb, - VP8LColorCache* const hashers, - const CostModel* const cost_model, - int idx, int use_color_cache, - double prev_cost, float* const cost, - uint16_t* const dist_array) { - double cost_val = prev_cost; - const uint32_t color = argb[0]; - const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1; - if (ix >= 0) { - // use_color_cache is true and hashers contains color - const double mul0 = 0.68; - cost_val += GetCacheCost(cost_model, ix) * mul0; - } else { - const double mul1 = 0.82; - if (use_color_cache) VP8LColorCacheInsert(hashers, color); - cost_val += GetLiteralCost(cost_model, color) * mul1; - } - if (cost[idx] > cost_val) { - cost[idx] = (float)cost_val; - dist_array[idx] = 1; // only one is inserted. - } -} - -// ----------------------------------------------------------------------------- -// CostManager and interval handling - -// Empirical value to avoid high memory consumption but good for performance. -#define COST_CACHE_INTERVAL_SIZE_MAX 100 - -// To perform backward reference every pixel at index index_ is considered and -// the cost for the MAX_LENGTH following pixels computed. Those following pixels -// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of: -// distance_cost_ at index_ + GetLengthCost(cost_model, k) -// (named cost) (named cached cost) -// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an -// array of size MAX_LENGTH. -// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the -// minimal values using intervals, for which lower_ and upper_ bounds are kept. -// An interval is defined by the index_ of the pixel that generated it and -// is only useful in a range of indices from start_ to end_ (exclusive), i.e. -// it contains the minimum value for pixels between start_ and end_. -// Intervals are stored in a linked list and ordered by start_. When a new -// interval has a better minimum, old intervals are split or removed. -typedef struct CostInterval CostInterval; -struct CostInterval { - double lower_; - double upper_; - int start_; - int end_; - double distance_cost_; - int index_; - CostInterval* previous_; - CostInterval* next_; -}; - -// The GetLengthCost(cost_model, k) part of the costs is also bounded for -// efficiency in a set of intervals of a different type. -// If those intervals are small enough, they are not used for comparison and -// written into the costs right away. -typedef struct { - double lower_; // Lower bound of the interval. - double upper_; // Upper bound of the interval. - int start_; - int end_; // Exclusive. - int do_write_; // If !=0, the interval is saved to cost instead of being kept - // for comparison. -} CostCacheInterval; - -// This structure is in charge of managing intervals and costs. -// It caches the different CostCacheInterval, caches the different -// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose -// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX). -#define COST_MANAGER_MAX_FREE_LIST 10 -typedef struct { - CostInterval* head_; - int count_; // The number of stored intervals. - CostCacheInterval* cache_intervals_; - size_t cache_intervals_size_; - double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k). - double min_cost_cache_; // The minimum value in cost_cache_[1:]. - double max_cost_cache_; // The maximum value in cost_cache_[1:]. - float* costs_; - uint16_t* dist_array_; - // Most of the time, we only need few intervals -> use a free-list, to avoid - // fragmentation with small allocs in most common cases. - CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST]; - CostInterval* free_intervals_; - // These are regularly malloc'd remains. This list can't grow larger than than - // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note. - CostInterval* recycled_intervals_; - // Buffer used in BackwardReferencesHashChainDistanceOnly to store the ends - // of the intervals that can have impacted the cost at a pixel. - int* interval_ends_; - int interval_ends_size_; -} CostManager; - -static int IsCostCacheIntervalWritable(int start, int end) { - // 100 is the length for which we consider an interval for comparison, and not - // for writing. - // The first intervals are very small and go in increasing size. This constant - // helps merging them into one big interval (up to index 150/200 usually from - // which intervals start getting much bigger). - // This value is empirical. - return (end - start + 1 < 100); -} - -static void CostIntervalAddToFreeList(CostManager* const manager, - CostInterval* const interval) { - interval->next_ = manager->free_intervals_; - manager->free_intervals_ = interval; -} - -static int CostIntervalIsInFreeList(const CostManager* const manager, - const CostInterval* const interval) { - return (interval >= &manager->intervals_[0] && - interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]); -} - -static void CostManagerInitFreeList(CostManager* const manager) { - int i; - manager->free_intervals_ = NULL; - for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) { - CostIntervalAddToFreeList(manager, &manager->intervals_[i]); - } -} - -static void DeleteIntervalList(CostManager* const manager, - const CostInterval* interval) { - while (interval != NULL) { - const CostInterval* const next = interval->next_; - if (!CostIntervalIsInFreeList(manager, interval)) { - WebPSafeFree((void*)interval); - } // else: do nothing - interval = next; - } -} - -static void CostManagerClear(CostManager* const manager) { - if (manager == NULL) return; - - WebPSafeFree(manager->costs_); - WebPSafeFree(manager->cache_intervals_); - WebPSafeFree(manager->interval_ends_); - - // Clear the interval lists. - DeleteIntervalList(manager, manager->head_); - manager->head_ = NULL; - DeleteIntervalList(manager, manager->recycled_intervals_); - manager->recycled_intervals_ = NULL; - - // Reset pointers, count_ and cache_intervals_size_. - memset(manager, 0, sizeof(*manager)); - CostManagerInitFreeList(manager); -} - -static int CostManagerInit(CostManager* const manager, - uint16_t* const dist_array, int pix_count, - const CostModel* const cost_model) { +// Compute an LZ77 by forcing matches to happen within a given distance cost. +// We therefore limit the algorithm to the lowest 32 values in the PlaneCode +// definition. +#define WINDOW_OFFSETS_SIZE_MAX 32 +static int BackwardReferencesLz77Box(int xsize, int ysize, + const uint32_t* const argb, int cache_bits, + const VP8LHashChain* const hash_chain_best, + VP8LHashChain* hash_chain, + VP8LBackwardRefs* const refs) { int i; - const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count; - // This constant is tied to the cost_model we use. - // Empirically, differences between intervals is usually of more than 1. - const double min_cost_diff = 0.1; - - manager->costs_ = NULL; - manager->cache_intervals_ = NULL; - manager->interval_ends_ = NULL; - manager->head_ = NULL; - manager->recycled_intervals_ = NULL; - manager->count_ = 0; - manager->dist_array_ = dist_array; - CostManagerInitFreeList(manager); - - // Fill in the cost_cache_. - manager->cache_intervals_size_ = 1; - manager->cost_cache_[0] = 0; - for (i = 1; i < cost_cache_size; ++i) { - manager->cost_cache_[i] = GetLengthCost(cost_model, i); - // Get an approximation of the number of bound intervals. - if (fabs(manager->cost_cache_[i] - manager->cost_cache_[i - 1]) > - min_cost_diff) { - ++manager->cache_intervals_size_; - } - // Compute the minimum of cost_cache_. - if (i == 1) { - manager->min_cost_cache_ = manager->cost_cache_[1]; - manager->max_cost_cache_ = manager->cost_cache_[1]; - } else if (manager->cost_cache_[i] < manager->min_cost_cache_) { - manager->min_cost_cache_ = manager->cost_cache_[i]; - } else if (manager->cost_cache_[i] > manager->max_cost_cache_) { - manager->max_cost_cache_ = manager->cost_cache_[i]; - } - } - - // With the current cost models, we have 15 intervals, so we are safe by - // setting a maximum of COST_CACHE_INTERVAL_SIZE_MAX. - if (manager->cache_intervals_size_ > COST_CACHE_INTERVAL_SIZE_MAX) { - manager->cache_intervals_size_ = COST_CACHE_INTERVAL_SIZE_MAX; - } - manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc( - manager->cache_intervals_size_, sizeof(*manager->cache_intervals_)); - if (manager->cache_intervals_ == NULL) { - CostManagerClear(manager); - return 0; - } - - // Fill in the cache_intervals_. - { - double cost_prev = -1e38f; // unprobably low initial value - CostCacheInterval* prev = NULL; - CostCacheInterval* cur = manager->cache_intervals_; - const CostCacheInterval* const end = - manager->cache_intervals_ + manager->cache_intervals_size_; - - // Consecutive values in cost_cache_ are compared and if a big enough - // difference is found, a new interval is created and bounded. - for (i = 0; i < cost_cache_size; ++i) { - const double cost_val = manager->cost_cache_[i]; - if (i == 0 || - (fabs(cost_val - cost_prev) > min_cost_diff && cur + 1 < end)) { - if (i > 1) { - const int is_writable = - IsCostCacheIntervalWritable(cur->start_, cur->end_); - // Merge with the previous interval if both are writable. - if (is_writable && cur != manager->cache_intervals_ && - prev->do_write_) { - // Update the previous interval. - prev->end_ = cur->end_; - if (cur->lower_ < prev->lower_) { - prev->lower_ = cur->lower_; - } else if (cur->upper_ > prev->upper_) { - prev->upper_ = cur->upper_; - } - } else { - cur->do_write_ = is_writable; - prev = cur; - ++cur; - } - } - // Initialize an interval. - cur->start_ = i; - cur->do_write_ = 0; - cur->lower_ = cost_val; - cur->upper_ = cost_val; - } else { - // Update the current interval bounds. - if (cost_val < cur->lower_) { - cur->lower_ = cost_val; - } else if (cost_val > cur->upper_) { - cur->upper_ = cost_val; - } - } - cur->end_ = i + 1; - cost_prev = cost_val; + const int pix_count = xsize * ysize; + uint16_t* counts; + int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0}; + int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0}; + int window_offsets_size = 0; + int window_offsets_new_size = 0; + uint16_t* const counts_ini = + (uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini)); + int best_offset_prev = -1, best_length_prev = -1; + if (counts_ini == NULL) return 0; + + // counts[i] counts how many times a pixel is repeated starting at position i. + i = pix_count - 2; + counts = counts_ini + i; + counts[1] = 1; + for (; i >= 0; --i, --counts) { + if (argb[i] == argb[i + 1]) { + // Max out the counts to MAX_LENGTH. + counts[0] = counts[1] + (counts[1] != MAX_LENGTH); + } else { + counts[0] = 1; } - manager->cache_intervals_size_ = cur + 1 - manager->cache_intervals_; } - manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_)); - if (manager->costs_ == NULL) { - CostManagerClear(manager); - return 0; - } - // Set the initial costs_ high for every pixel as we will keep the minimum. - for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f; - - // The cost at pixel is influenced by the cost intervals from previous pixels. - // Let us take the specific case where the offset is the same (which actually - // happens a lot in case of uniform regions). - // pixel i contributes to j>i a cost of: offset cost + cost_cache_[j-i] - // pixel i+1 contributes to j>i a cost of: 2*offset cost + cost_cache_[j-i-1] - // pixel i+2 contributes to j>i a cost of: 3*offset cost + cost_cache_[j-i-2] - // and so on. - // A pixel i influences the following length(j) < MAX_LENGTH pixels. What is - // the value of j such that pixel i + j cannot influence any of those pixels? - // This value is such that: - // max of cost_cache_ < j*offset cost + min of cost_cache_ - // (pixel i + j 's cost cannot beat the worst cost given by pixel i). - // This value will be used to optimize the cost computation in - // BackwardReferencesHashChainDistanceOnly. + // Figure out the window offsets around a pixel. They are stored in a + // spiraling order around the pixel as defined by VP8LDistanceToPlaneCode. { - // The offset cost is computed in GetDistanceCost and has a minimum value of - // the minimum in cost_model->distance_. The case where the offset cost is 0 - // will be dealt with differently later so we are only interested in the - // minimum non-zero offset cost. - double offset_cost_min = 0.; - int size; - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - if (cost_model->distance_[i] != 0) { - if (offset_cost_min == 0.) { - offset_cost_min = cost_model->distance_[i]; - } else if (cost_model->distance_[i] < offset_cost_min) { - offset_cost_min = cost_model->distance_[i]; - } + int x, y; + for (y = 0; y <= 6; ++y) { + for (x = -6; x <= 6; ++x) { + const int offset = y * xsize + x; + int plane_code; + // Ignore offsets that bring us after the pixel. + if (offset <= 0) continue; + plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1; + if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue; + window_offsets[plane_code] = offset; } } - // In case all the cost_model->distance_ is 0, the next non-zero cost we - // can have is from the extra bit in GetDistanceCost, hence 1. - if (offset_cost_min < 1.) offset_cost_min = 1.; - - size = 1 + (int)ceil((manager->max_cost_cache_ - manager->min_cost_cache_) / - offset_cost_min); - // Empirically, we usually end up with a value below 100. - if (size > MAX_LENGTH) size = MAX_LENGTH; - - manager->interval_ends_ = - (int*)WebPSafeMalloc(size, sizeof(*manager->interval_ends_)); - if (manager->interval_ends_ == NULL) { - CostManagerClear(manager); - return 0; - } - manager->interval_ends_size_ = size; - } - - return 1; -} - -// Given the distance_cost for pixel 'index', update the cost at pixel 'i' if it -// is smaller than the previously computed value. -static WEBP_INLINE void UpdateCost(CostManager* const manager, int i, int index, - double distance_cost) { - int k = i - index; - double cost_tmp; - assert(k >= 0 && k < MAX_LENGTH); - cost_tmp = distance_cost + manager->cost_cache_[k]; - - if (manager->costs_[i] > cost_tmp) { - manager->costs_[i] = (float)cost_tmp; - manager->dist_array_[i] = k + 1; - } -} - -// Given the distance_cost for pixel 'index', update the cost for all the pixels -// between 'start' and 'end' excluded. -static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager, - int start, int end, int index, - double distance_cost) { - int i; - for (i = start; i < end; ++i) UpdateCost(manager, i, index, distance_cost); -} - -// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'. -static WEBP_INLINE void ConnectIntervals(CostManager* const manager, - CostInterval* const prev, - CostInterval* const next) { - if (prev != NULL) { - prev->next_ = next; - } else { - manager->head_ = next; - } - - if (next != NULL) next->previous_ = prev; -} - -// Pop an interval in the manager. -static WEBP_INLINE void PopInterval(CostManager* const manager, - CostInterval* const interval) { - CostInterval* const next = interval->next_; - - if (interval == NULL) return; - - ConnectIntervals(manager, interval->previous_, next); - if (CostIntervalIsInFreeList(manager, interval)) { - CostIntervalAddToFreeList(manager, interval); - } else { // recycle regularly malloc'd intervals too - interval->next_ = manager->recycled_intervals_; - manager->recycled_intervals_ = interval; - } - --manager->count_; - assert(manager->count_ >= 0); -} - -// Update the cost at index i by going over all the stored intervals that -// overlap with i. -static WEBP_INLINE void UpdateCostPerIndex(CostManager* const manager, int i) { - CostInterval* current = manager->head_; - - while (current != NULL && current->start_ <= i) { - if (current->end_ <= i) { - // We have an outdated interval, remove it. - CostInterval* next = current->next_; - PopInterval(manager, current); - current = next; - } else { - UpdateCost(manager, i, current->index_, current->distance_cost_); - current = current->next_; - } - } -} - -// Given a current orphan interval and its previous interval, before -// it was orphaned (which can be NULL), set it at the right place in the list -// of intervals using the start_ ordering and the previous interval as a hint. -static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager, - CostInterval* const current, - CostInterval* previous) { - assert(current != NULL); - - if (previous == NULL) previous = manager->head_; - while (previous != NULL && current->start_ < previous->start_) { - previous = previous->previous_; - } - while (previous != NULL && previous->next_ != NULL && - previous->next_->start_ < current->start_) { - previous = previous->next_; - } - - if (previous != NULL) { - ConnectIntervals(manager, current, previous->next_); - } else { - ConnectIntervals(manager, current, manager->head_); - } - ConnectIntervals(manager, previous, current); -} - -// Insert an interval in the list contained in the manager by starting at -// interval_in as a hint. The intervals are sorted by start_ value. -static WEBP_INLINE void InsertInterval(CostManager* const manager, - CostInterval* const interval_in, - double distance_cost, double lower, - double upper, int index, int start, - int end) { - CostInterval* interval_new; - - if (IsCostCacheIntervalWritable(start, end) || - manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) { - // Write down the interval if it is too small. - UpdateCostPerInterval(manager, start, end, index, distance_cost); - return; - } - if (manager->free_intervals_ != NULL) { - interval_new = manager->free_intervals_; - manager->free_intervals_ = interval_new->next_; - } else if (manager->recycled_intervals_ != NULL) { - interval_new = manager->recycled_intervals_; - manager->recycled_intervals_ = interval_new->next_; - } else { // malloc for good - interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new)); - if (interval_new == NULL) { - // Write down the interval if we cannot create it. - UpdateCostPerInterval(manager, start, end, index, distance_cost); - return; - } - } - - interval_new->distance_cost_ = distance_cost; - interval_new->lower_ = lower; - interval_new->upper_ = upper; - interval_new->index_ = index; - interval_new->start_ = start; - interval_new->end_ = end; - PositionOrphanInterval(manager, interval_new, interval_in); - - ++manager->count_; -} - -// When an interval has its start_ or end_ modified, it needs to be -// repositioned in the linked list. -static WEBP_INLINE void RepositionInterval(CostManager* const manager, - CostInterval* const interval) { - if (IsCostCacheIntervalWritable(interval->start_, interval->end_)) { - // Maybe interval has been resized and is small enough to be removed. - UpdateCostPerInterval(manager, interval->start_, interval->end_, - interval->index_, interval->distance_cost_); - PopInterval(manager, interval); - return; - } - - // Early exit if interval is at the right spot. - if ((interval->previous_ == NULL || - interval->previous_->start_ <= interval->start_) && - (interval->next_ == NULL || - interval->start_ <= interval->next_->start_)) { - return; - } - - ConnectIntervals(manager, interval->previous_, interval->next_); - PositionOrphanInterval(manager, interval, interval->previous_); -} - -// Given a new cost interval defined by its start at index, its last value and -// distance_cost, add its contributions to the previous intervals and costs. -// If handling the interval or one of its subintervals becomes to heavy, its -// contribution is added to the costs right away. -static WEBP_INLINE void PushInterval(CostManager* const manager, - double distance_cost, int index, - int last) { - size_t i; - CostInterval* interval = manager->head_; - CostInterval* interval_next; - const CostCacheInterval* const cost_cache_intervals = - manager->cache_intervals_; - - for (i = 0; i < manager->cache_intervals_size_ && - cost_cache_intervals[i].start_ < last; - ++i) { - // Define the intersection of the ith interval with the new one. - int start = index + cost_cache_intervals[i].start_; - const int end = index + (cost_cache_intervals[i].end_ > last - ? last - : cost_cache_intervals[i].end_); - const double lower_in = cost_cache_intervals[i].lower_; - const double upper_in = cost_cache_intervals[i].upper_; - const double lower_full_in = distance_cost + lower_in; - const double upper_full_in = distance_cost + upper_in; - - if (cost_cache_intervals[i].do_write_) { - UpdateCostPerInterval(manager, start, end, index, distance_cost); - continue; + // For narrow images, not all plane codes are reached, so remove those. + for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) { + if (window_offsets[i] == 0) continue; + window_offsets[window_offsets_size++] = window_offsets[i]; } - - for (; interval != NULL && interval->start_ < end && start < end; - interval = interval_next) { - const double lower_full_interval = - interval->distance_cost_ + interval->lower_; - const double upper_full_interval = - interval->distance_cost_ + interval->upper_; - - interval_next = interval->next_; - - // Make sure we have some overlap - if (start >= interval->end_) continue; - - if (lower_full_in >= upper_full_interval) { - // When intervals are represented, the lower, the better. - // [**********************************************************] - // start end - // [----------------------------------] - // interval->start_ interval->end_ - // If we are worse than what we already have, add whatever we have so - // far up to interval. - const int start_new = interval->end_; - InsertInterval(manager, interval, distance_cost, lower_in, upper_in, - index, start, interval->start_); - start = start_new; - continue; + // Given a pixel P, find the offsets that reach pixels unreachable from P-1 + // with any of the offsets in window_offsets[]. + for (i = 0; i < window_offsets_size; ++i) { + int j; + int is_reachable = 0; + for (j = 0; j < window_offsets_size && !is_reachable; ++j) { + is_reachable |= (window_offsets[i] == window_offsets[j] + 1); } - - // We know the two intervals intersect. - if (upper_full_in >= lower_full_interval) { - // There is no clear cut on which is best, so let's keep both. - // [*********[*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*]***********] - // start interval->start_ interval->end_ end - // OR - // [*********[*-*-*-*-*-*-*-*-*-*-*-]----------------------] - // start interval->start_ end interval->end_ - const int end_new = (interval->end_ <= end) ? interval->end_ : end; - InsertInterval(manager, interval, distance_cost, lower_in, upper_in, - index, start, end_new); - start = end_new; - } else if (start <= interval->start_ && interval->end_ <= end) { - // [----------------------------------] - // interval->start_ interval->end_ - // [**************************************************************] - // start end - // We can safely remove the old interval as it is fully included. - PopInterval(manager, interval); - } else { - if (interval->start_ <= start && end <= interval->end_) { - // [--------------------------------------------------------------] - // interval->start_ interval->end_ - // [*****************************] - // start end - // We have to split the old interval as it fully contains the new one. - const int end_original = interval->end_; - interval->end_ = start; - InsertInterval(manager, interval, interval->distance_cost_, - interval->lower_, interval->upper_, interval->index_, - end, end_original); - } else if (interval->start_ < start) { - // [------------------------------------] - // interval->start_ interval->end_ - // [*****************************] - // start end - interval->end_ = start; - } else { - // [------------------------------------] - // interval->start_ interval->end_ - // [*****************************] - // start end - interval->start_ = end; - } - - // The interval has been modified, we need to reposition it or write it. - RepositionInterval(manager, interval); + if (!is_reachable) { + window_offsets_new[window_offsets_new_size] = window_offsets[i]; + ++window_offsets_new_size; } } - // Insert the remaining interval from start to end. - InsertInterval(manager, interval, distance_cost, lower_in, upper_in, index, - start, end); - } -} - -static int BackwardReferencesHashChainDistanceOnly( - int xsize, int ysize, const uint32_t* const argb, int quality, - int cache_bits, const VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs, uint16_t* const dist_array) { - int i; - int ok = 0; - int cc_init = 0; - const int pix_count = xsize * ysize; - const int use_color_cache = (cache_bits > 0); - const size_t literal_array_size = sizeof(double) * - (NUM_LITERAL_CODES + NUM_LENGTH_CODES + - ((cache_bits > 0) ? (1 << cache_bits) : 0)); - const size_t cost_model_size = sizeof(CostModel) + literal_array_size; - CostModel* const cost_model = - (CostModel*)WebPSafeCalloc(1ULL, cost_model_size); - VP8LColorCache hashers; - const int skip_length = 32 + quality; - const int skip_min_distance_code = 2; - CostManager* cost_manager = - (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager)); - - if (cost_model == NULL || cost_manager == NULL) goto Error; - - cost_model->literal_ = (double*)(cost_model + 1); - if (use_color_cache) { - cc_init = VP8LColorCacheInit(&hashers, cache_bits); - if (!cc_init) goto Error; } - if (!CostModelBuild(cost_model, cache_bits, refs)) { - goto Error; - } - - if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) { - goto Error; - } - - // We loop one pixel at a time, but store all currently best points to - // non-processed locations from this point. - dist_array[0] = 0; - // Add first pixel as literal. - AddSingleLiteralWithCostModel(argb + 0, &hashers, cost_model, 0, - use_color_cache, 0.0, cost_manager->costs_, - dist_array); - - for (i = 1; i < pix_count - 1; ++i) { - int offset = 0, len = 0; - double prev_cost = cost_manager->costs_[i - 1]; - HashChainFindCopy(hash_chain, i, &offset, &len); - if (len >= 2) { - // If we are dealing with a non-literal. - const int code = DistanceToPlaneCode(xsize, offset); - const double offset_cost = GetDistanceCost(cost_model, code); - const int first_i = i; - int j_max = 0, interval_ends_index = 0; - const int is_offset_zero = (offset_cost == 0.); - - if (!is_offset_zero) { - j_max = (int)ceil( - (cost_manager->max_cost_cache_ - cost_manager->min_cost_cache_) / - offset_cost); - if (j_max < 1) { - j_max = 1; - } else if (j_max > cost_manager->interval_ends_size_ - 1) { - // This could only happen in the case of MAX_LENGTH. - j_max = cost_manager->interval_ends_size_ - 1; + hash_chain->offset_length_[0] = 0; + for (i = 1; i < pix_count; ++i) { + int ind; + int best_length = VP8LHashChainFindLength(hash_chain_best, i); + int best_offset; + int do_compute = 1; + + if (best_length >= MAX_LENGTH) { + // Do not recompute the best match if we already have a maximal one in the + // window. + best_offset = VP8LHashChainFindOffset(hash_chain_best, i); + for (ind = 0; ind < window_offsets_size; ++ind) { + if (best_offset == window_offsets[ind]) { + do_compute = 0; + break; } - } // else j_max is unused anyway. - - // Instead of considering all contributions from a pixel i by calling: - // PushInterval(cost_manager, prev_cost + offset_cost, i, len); - // we optimize these contributions in case offset_cost stays the same for - // consecutive pixels. This describes a set of pixels similar to a - // previous set (e.g. constant color regions). - for (; i < pix_count - 1; ++i) { - int offset_next, len_next; - prev_cost = cost_manager->costs_[i - 1]; - - if (is_offset_zero) { - // No optimization can be made so we just push all of the - // contributions from i. - PushInterval(cost_manager, prev_cost, i, len); - } else { - // j_max is chosen as the smallest j such that: - // max of cost_cache_ < j*offset cost + min of cost_cache_ - // Therefore, the pixel influenced by i-j_max, cannot be influenced - // by i. Only the costs after the end of what i contributed need to be - // updated. cost_manager->interval_ends_ is a circular buffer that - // stores those ends. - const double distance_cost = prev_cost + offset_cost; - int j = cost_manager->interval_ends_[interval_ends_index]; - if (i - first_i <= j_max || - !IsCostCacheIntervalWritable(j, i + len)) { - PushInterval(cost_manager, distance_cost, i, len); - } else { - for (; j < i + len; ++j) { - UpdateCost(cost_manager, j, i, distance_cost); - } - } - // Store the new end in the circular buffer. - assert(interval_ends_index < cost_manager->interval_ends_size_); - cost_manager->interval_ends_[interval_ends_index] = i + len; - if (++interval_ends_index > j_max) interval_ends_index = 0; - } - - // Check whether i is the last pixel to consider, as it is handled - // differently. - if (i + 1 >= pix_count - 1) break; - HashChainFindCopy(hash_chain, i + 1, &offset_next, &len_next); - if (offset_next != offset) break; - len = len_next; - UpdateCostPerIndex(cost_manager, i); - AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i, - use_color_cache, prev_cost, - cost_manager->costs_, dist_array); } - // Submit the last pixel. - UpdateCostPerIndex(cost_manager, i + 1); - - // This if is for speedup only. It roughly doubles the speed, and - // makes compression worse by .1 %. - if (len >= skip_length && code <= skip_min_distance_code) { - // Long copy for short distances, let's skip the middle - // lookups for better copies. - // 1) insert the hashes. - if (use_color_cache) { - int k; - for (k = 0; k < len; ++k) { - VP8LColorCacheInsert(&hashers, argb[i + k]); + } + if (do_compute) { + // Figure out if we should use the offset/length from the previous pixel + // as an initial guess and therefore only inspect the offsets in + // window_offsets_new[]. + const int use_prev = + (best_length_prev > 1) && (best_length_prev < MAX_LENGTH); + const int num_ind = + use_prev ? window_offsets_new_size : window_offsets_size; + best_length = use_prev ? best_length_prev - 1 : 0; + best_offset = use_prev ? best_offset_prev : 0; + // Find the longest match in a window around the pixel. + for (ind = 0; ind < num_ind; ++ind) { + int curr_length = 0; + int j = i; + int j_offset = + use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind]; + if (j_offset < 0 || argb[j_offset] != argb[i]) continue; + // The longest match is the sum of how many times each pixel is + // repeated. + do { + const int counts_j_offset = counts_ini[j_offset]; + const int counts_j = counts_ini[j]; + if (counts_j_offset != counts_j) { + curr_length += + (counts_j_offset < counts_j) ? counts_j_offset : counts_j; + break; + } + // The same color is repeated counts_pos times at j_offset and j. + curr_length += counts_j_offset; + j_offset += counts_j_offset; + j += counts_j_offset; + } while (curr_length <= MAX_LENGTH && j < pix_count && + argb[j_offset] == argb[j]); + if (best_length < curr_length) { + best_offset = + use_prev ? window_offsets_new[ind] : window_offsets[ind]; + if (curr_length >= MAX_LENGTH) { + best_length = MAX_LENGTH; + break; + } else { + best_length = curr_length; } - } - // 2) jump. - { - const int i_next = i + len - 1; // for loop does ++i, thus -1 here. - for (; i <= i_next; ++i) UpdateCostPerIndex(cost_manager, i + 1); - i = i_next; - } - goto next_symbol; - } - if (len > 2) { - // Also try the smallest interval possible (size 2). - double cost_total = - prev_cost + offset_cost + GetLengthCost(cost_model, 1); - if (cost_manager->costs_[i + 1] > cost_total) { - cost_manager->costs_[i + 1] = (float)cost_total; - dist_array[i + 1] = 2; } } - } else { - // The pixel is added as a single literal so just update the costs. - UpdateCostPerIndex(cost_manager, i + 1); } - AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i, - use_color_cache, prev_cost, - cost_manager->costs_, dist_array); - - next_symbol: ; - } - // Handle the last pixel. - if (i == (pix_count - 1)) { - AddSingleLiteralWithCostModel( - argb + i, &hashers, cost_model, i, use_color_cache, - cost_manager->costs_[pix_count - 2], cost_manager->costs_, dist_array); - } - - ok = !refs->error_; - Error: - if (cc_init) VP8LColorCacheClear(&hashers); - CostManagerClear(cost_manager); - WebPSafeFree(cost_model); - WebPSafeFree(cost_manager); - return ok; -} - -// We pack the path at the end of *dist_array and return -// a pointer to this part of the array. Example: -// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232] -static void TraceBackwards(uint16_t* const dist_array, - int dist_array_size, - uint16_t** const chosen_path, - int* const chosen_path_size) { - uint16_t* path = dist_array + dist_array_size; - uint16_t* cur = dist_array + dist_array_size - 1; - while (cur >= dist_array) { - const int k = *cur; - --path; - *path = k; - cur -= k; - } - *chosen_path = path; - *chosen_path_size = (int)(dist_array + dist_array_size - path); -} - -static int BackwardReferencesHashChainFollowChosenPath( - const uint32_t* const argb, int cache_bits, - const uint16_t* const chosen_path, int chosen_path_size, - const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) { - const int use_color_cache = (cache_bits > 0); - int ix; - int i = 0; - int ok = 0; - int cc_init = 0; - VP8LColorCache hashers; - - if (use_color_cache) { - cc_init = VP8LColorCacheInit(&hashers, cache_bits); - if (!cc_init) goto Error; - } - - ClearBackwardRefs(refs); - for (ix = 0; ix < chosen_path_size; ++ix) { - const int len = chosen_path[ix]; - if (len != 1) { - int k; - const int offset = HashChainFindOffset(hash_chain, i); - BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len)); - if (use_color_cache) { - for (k = 0; k < len; ++k) { - VP8LColorCacheInsert(&hashers, argb[i + k]); - } - } - i += len; + assert(i + best_length <= pix_count); + assert(best_length <= MAX_LENGTH); + if (best_length <= MIN_LENGTH) { + hash_chain->offset_length_[i] = 0; + best_offset_prev = 0; + best_length_prev = 0; } else { - PixOrCopy v; - const int idx = - use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1; - if (idx >= 0) { - // use_color_cache is true and hashers contains argb[i] - // push pixel as a color cache index - v = PixOrCopyCreateCacheIdx(idx); - } else { - if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]); - v = PixOrCopyCreateLiteral(argb[i]); - } - BackwardRefsCursorAdd(refs, v); - ++i; + hash_chain->offset_length_[i] = + (best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length; + best_offset_prev = best_offset; + best_length_prev = best_length; } } - ok = !refs->error_; - Error: - if (cc_init) VP8LColorCacheClear(&hashers); - return ok; -} + hash_chain->offset_length_[0] = 0; + WebPSafeFree(counts_ini); -// Returns 1 on success. -static int BackwardReferencesTraceBackwards( - int xsize, int ysize, const uint32_t* const argb, int quality, - int cache_bits, const VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs) { - int ok = 0; - const int dist_array_size = xsize * ysize; - uint16_t* chosen_path = NULL; - int chosen_path_size = 0; - uint16_t* dist_array = - (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array)); - - if (dist_array == NULL) goto Error; - - if (!BackwardReferencesHashChainDistanceOnly( - xsize, ysize, argb, quality, cache_bits, hash_chain, - refs, dist_array)) { - goto Error; - } - TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size); - if (!BackwardReferencesHashChainFollowChosenPath( - argb, cache_bits, chosen_path, chosen_path_size, hash_chain, refs)) { - goto Error; - } - ok = 1; - Error: - WebPSafeFree(dist_array); - return ok; + return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain, + refs); } +// ----------------------------------------------------------------------------- + static void BackwardReferences2DLocality(int xsize, const VP8LBackwardRefs* const refs) { VP8LRefsCursor c = VP8LRefsCursorInit(refs); while (VP8LRefsCursorOk(&c)) { if (PixOrCopyIsCopy(c.cur_pos)) { const int dist = c.cur_pos->argb_or_distance; - const int transformed_dist = DistanceToPlaneCode(xsize, dist); + const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist); c.cur_pos->argb_or_distance = transformed_dist; } VP8LRefsCursorNext(&c); } } -// Computes the entropies for a color cache size (in bits) between 0 (unused) -// and cache_bits_max (inclusive). -// Returns 1 on success, 0 in case of allocation error. -static int ComputeCacheEntropies(const uint32_t* argb, - const VP8LBackwardRefs* const refs, - int cache_bits_max, double entropies[]) { +// Evaluate optimal cache bits for the local color cache. +// The input *best_cache_bits sets the maximum cache bits to use (passing 0 +// implies disabling the local color cache). The local color cache is also +// disabled for the lower (<= 25) quality. +// Returns 0 in case of memory error. +static int CalculateBestCacheSize(const uint32_t* argb, int quality, + const VP8LBackwardRefs* const refs, + int* const best_cache_bits) { + int i; + const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits; + double entropy_min = MAX_ENTROPY; int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 }; VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1]; VP8LRefsCursor c = VP8LRefsCursorInit(refs); VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL }; int ok = 0; - int i; + assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS); + + if (cache_bits_max == 0) { + *best_cache_bits = 0; + // Local color cache is disabled. + return 1; + } + + // Allocate data. for (i = 0; i <= cache_bits_max; ++i) { histos[i] = VP8LAllocateHistogram(i); if (histos[i] == NULL) goto Error; + VP8LHistogramInit(histos[i], i, /*init_arrays=*/ 1); if (i == 0) continue; cc_init[i] = VP8LColorCacheInit(&hashers[i], i); if (!cc_init[i]) goto Error; } - assert(cache_bits_max >= 0); - // Do not use the color cache for cache_bits=0. + // Find the cache_bits giving the lowest entropy. The search is done in a + // brute-force way as the function (entropy w.r.t cache_bits) can be + // anything in practice. while (VP8LRefsCursorOk(&c)) { - VP8LHistogramAddSinglePixOrCopy(histos[0], c.cur_pos); - VP8LRefsCursorNext(&c); - } - if (cache_bits_max > 0) { - c = VP8LRefsCursorInit(refs); - while (VP8LRefsCursorOk(&c)) { - const PixOrCopy* const v = c.cur_pos; - if (PixOrCopyIsLiteral(v)) { - const uint32_t pix = *argb++; - // The keys of the caches can be derived from the longest one. - int key = HashPix(pix, 32 - cache_bits_max); - for (i = cache_bits_max; i >= 1; --i, key >>= 1) { - if (VP8LColorCacheLookup(&hashers[i], key) == pix) { - ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key]; - } else { - VP8LColorCacheSet(&hashers[i], key, pix); - ++histos[i]->blue_[pix & 0xff]; - ++histos[i]->literal_[(pix >> 8) & 0xff]; - ++histos[i]->red_[(pix >> 16) & 0xff]; - ++histos[i]->alpha_[pix >> 24]; - } - } - } else { - // Update the histograms for distance/length. - int len = PixOrCopyLength(v); - int code_dist, code_len, extra_bits; - uint32_t argb_prev = *argb ^ 0xffffffffu; - VP8LPrefixEncodeBits(len, &code_len, &extra_bits); - VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code_dist, &extra_bits); - for (i = 1; i <= cache_bits_max; ++i) { - ++histos[i]->literal_[NUM_LITERAL_CODES + code_len]; - ++histos[i]->distance_[code_dist]; + const PixOrCopy* const v = c.cur_pos; + if (PixOrCopyIsLiteral(v)) { + const uint32_t pix = *argb++; + const uint32_t a = (pix >> 24) & 0xff; + const uint32_t r = (pix >> 16) & 0xff; + const uint32_t g = (pix >> 8) & 0xff; + const uint32_t b = (pix >> 0) & 0xff; + // The keys of the caches can be derived from the longest one. + int key = VP8LHashPix(pix, 32 - cache_bits_max); + // Do not use the color cache for cache_bits = 0. + ++histos[0]->blue_[b]; + ++histos[0]->literal_[g]; + ++histos[0]->red_[r]; + ++histos[0]->alpha_[a]; + // Deal with cache_bits > 0. + for (i = cache_bits_max; i >= 1; --i, key >>= 1) { + if (VP8LColorCacheLookup(&hashers[i], key) == pix) { + ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key]; + } else { + VP8LColorCacheSet(&hashers[i], key, pix); + ++histos[i]->blue_[b]; + ++histos[i]->literal_[g]; + ++histos[i]->red_[r]; + ++histos[i]->alpha_[a]; } - // Update the colors caches. - do { - if (*argb != argb_prev) { - // Efficiency: insert only if the color changes. - int key = HashPix(*argb, 32 - cache_bits_max); - for (i = cache_bits_max; i >= 1; --i, key >>= 1) { - hashers[i].colors_[key] = *argb; - } - argb_prev = *argb; - } - argb++; - } while (--len != 0); } - VP8LRefsCursorNext(&c); + } else { + // We should compute the contribution of the (distance,length) + // histograms but those are the same independently from the cache size. + // As those constant contributions are in the end added to the other + // histogram contributions, we can safely ignore them. + int len = PixOrCopyLength(v); + uint32_t argb_prev = *argb ^ 0xffffffffu; + // Update the color caches. + do { + if (*argb != argb_prev) { + // Efficiency: insert only if the color changes. + int key = VP8LHashPix(*argb, 32 - cache_bits_max); + for (i = cache_bits_max; i >= 1; --i, key >>= 1) { + hashers[i].colors_[key] = *argb; + } + argb_prev = *argb; + } + argb++; + } while (--len != 0); } + VP8LRefsCursorNext(&c); } + for (i = 0; i <= cache_bits_max; ++i) { - entropies[i] = VP8LHistogramEstimateBits(histos[i]); + const double entropy = VP8LHistogramEstimateBits(histos[i]); + if (i == 0 || entropy < entropy_min) { + entropy_min = entropy; + *best_cache_bits = i; + } } ok = 1; Error: @@ -1611,50 +791,6 @@ Error: return ok; } -// Evaluate optimal cache bits for the local color cache. -// The input *best_cache_bits sets the maximum cache bits to use (passing 0 -// implies disabling the local color cache). The local color cache is also -// disabled for the lower (<= 25) quality. -// Returns 0 in case of memory error. -static int CalculateBestCacheSize(const uint32_t* const argb, - int xsize, int ysize, int quality, - const VP8LHashChain* const hash_chain, - VP8LBackwardRefs* const refs, - int* const lz77_computed, - int* const best_cache_bits) { - int i; - int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits; - double entropy_min = MAX_ENTROPY; - double entropies[MAX_COLOR_CACHE_BITS + 1]; - - assert(cache_bits_high <= MAX_COLOR_CACHE_BITS); - - *lz77_computed = 0; - if (cache_bits_high == 0) { - *best_cache_bits = 0; - // Local color cache is disabled. - return 1; - } - // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color cache - // is not that different in practice. - if (!BackwardReferencesLz77(xsize, ysize, argb, 0, hash_chain, refs)) { - return 0; - } - // Find the cache_bits giving the lowest entropy. The search is done in a - // brute-force way as the function (entropy w.r.t cache_bits) can be - // anything in practice. - if (!ComputeCacheEntropies(argb, refs, cache_bits_high, entropies)) { - return 0; - } - for (i = 0; i <= cache_bits_high; ++i) { - if (i == 0 || entropies[i] < entropy_min) { - entropy_min = entropies[i]; - *best_cache_bits = i; - } - } - return 1; -} - // Update (in-place) backward references for specified cache_bits. static int BackwardRefsWithLocalCache(const uint32_t* const argb, int cache_bits, @@ -1693,8 +829,7 @@ static int BackwardRefsWithLocalCache(const uint32_t* const argb, static VP8LBackwardRefs* GetBackwardReferencesLowEffort( int width, int height, const uint32_t* const argb, int* const cache_bits, const VP8LHashChain* const hash_chain, - VP8LBackwardRefs refs_array[2]) { - VP8LBackwardRefs* refs_lz77 = &refs_array[0]; + VP8LBackwardRefs* const refs_lz77) { *cache_bits = 0; if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) { return NULL; @@ -1703,98 +838,108 @@ static VP8LBackwardRefs* GetBackwardReferencesLowEffort( return refs_lz77; } +extern int VP8LBackwardReferencesTraceBackwards( + int xsize, int ysize, const uint32_t* const argb, int cache_bits, + const VP8LHashChain* const hash_chain, + const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst); static VP8LBackwardRefs* GetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int* const cache_bits, const VP8LHashChain* const hash_chain, - VP8LBackwardRefs refs_array[2]) { - int lz77_is_useful; - int lz77_computed; - double bit_cost_lz77, bit_cost_rle; - VP8LBackwardRefs* best = NULL; - VP8LBackwardRefs* refs_lz77 = &refs_array[0]; - VP8LBackwardRefs* refs_rle = &refs_array[1]; + int lz77_types_to_try, int* const cache_bits, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best, + VP8LBackwardRefs* worst) { + const int cache_bits_initial = *cache_bits; + double bit_cost_best = -1; VP8LHistogram* histo = NULL; + int lz77_type, lz77_type_best = 0; + VP8LHashChain hash_chain_box; + memset(&hash_chain_box, 0, sizeof(hash_chain_box)); - if (!CalculateBestCacheSize(argb, width, height, quality, hash_chain, - refs_lz77, &lz77_computed, cache_bits)) { - goto Error; - } + histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS); + if (histo == NULL) goto Error; - if (lz77_computed) { - // Transform refs_lz77 for the optimized cache_bits. - if (*cache_bits > 0) { - if (!BackwardRefsWithLocalCache(argb, *cache_bits, refs_lz77)) { - goto Error; - } + for (lz77_type = 1; lz77_types_to_try; + lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) { + int res = 0; + double bit_cost; + int cache_bits_tmp = cache_bits_initial; + if ((lz77_types_to_try & lz77_type) == 0) continue; + switch (lz77_type) { + case kLZ77RLE: + res = BackwardReferencesRle(width, height, argb, 0, worst); + break; + case kLZ77Standard: + // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color + // cache is not that different in practice. + res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst); + break; + case kLZ77Box: + if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error; + res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain, + &hash_chain_box, worst); + break; + default: + assert(0); } - } else { - if (!BackwardReferencesLz77(width, height, argb, *cache_bits, hash_chain, - refs_lz77)) { + if (!res) goto Error; + + // Next, try with a color cache and update the references. + if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) { goto Error; } - } - - if (!BackwardReferencesRle(width, height, argb, *cache_bits, refs_rle)) { - goto Error; - } - - histo = VP8LAllocateHistogram(*cache_bits); - if (histo == NULL) goto Error; - - { - // Evaluate LZ77 coding. - VP8LHistogramCreate(histo, refs_lz77, *cache_bits); - bit_cost_lz77 = VP8LHistogramEstimateBits(histo); - // Evaluate RLE coding. - VP8LHistogramCreate(histo, refs_rle, *cache_bits); - bit_cost_rle = VP8LHistogramEstimateBits(histo); - // Decide if LZ77 is useful. - lz77_is_useful = (bit_cost_lz77 < bit_cost_rle); - } - - // Choose appropriate backward reference. - if (lz77_is_useful) { - // TraceBackwards is costly. Don't execute it at lower quality. - const int try_lz77_trace_backwards = (quality >= 25); - best = refs_lz77; // default guess: lz77 is better - if (try_lz77_trace_backwards) { - VP8LBackwardRefs* const refs_trace = refs_rle; - if (!VP8LBackwardRefsCopy(refs_lz77, refs_trace)) { - best = NULL; + if (cache_bits_tmp > 0) { + if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) { goto Error; } - if (BackwardReferencesTraceBackwards(width, height, argb, quality, - *cache_bits, hash_chain, - refs_trace)) { - double bit_cost_trace; - // Evaluate LZ77 coding. - VP8LHistogramCreate(histo, refs_trace, *cache_bits); - bit_cost_trace = VP8LHistogramEstimateBits(histo); - if (bit_cost_trace < bit_cost_lz77) { - best = refs_trace; - } - } } - } else { - best = refs_rle; + + // Keep the best backward references. + VP8LHistogramCreate(histo, worst, cache_bits_tmp); + bit_cost = VP8LHistogramEstimateBits(histo); + if (lz77_type_best == 0 || bit_cost < bit_cost_best) { + VP8LBackwardRefs* const tmp = worst; + worst = best; + best = tmp; + bit_cost_best = bit_cost; + *cache_bits = cache_bits_tmp; + lz77_type_best = lz77_type; + } + } + assert(lz77_type_best > 0); + + // Improve on simple LZ77 but only for high quality (TraceBackwards is + // costly). + if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) && + quality >= 25) { + const VP8LHashChain* const hash_chain_tmp = + (lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box; + if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits, + hash_chain_tmp, best, worst)) { + double bit_cost_trace; + VP8LHistogramCreate(histo, worst, *cache_bits); + bit_cost_trace = VP8LHistogramEstimateBits(histo); + if (bit_cost_trace < bit_cost_best) best = worst; + } } BackwardReferences2DLocality(width, best); - Error: +Error: + VP8LHashChainClear(&hash_chain_box); VP8LFreeHistogram(histo); return best; } VP8LBackwardRefs* VP8LGetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int low_effort, int* const cache_bits, - const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2]) { + int low_effort, int lz77_types_to_try, int* const cache_bits, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1, + VP8LBackwardRefs* const refs_tmp2) { if (low_effort) { return GetBackwardReferencesLowEffort(width, height, argb, cache_bits, - hash_chain, refs_array); + hash_chain, refs_tmp1); } else { - return GetBackwardReferences(width, height, argb, quality, cache_bits, - hash_chain, refs_array); + return GetBackwardReferences(width, height, argb, quality, + lz77_types_to_try, cache_bits, hash_chain, + refs_tmp1, refs_tmp2); } } diff --git a/src/3rdparty/libwebp/src/enc/backward_references_enc.h b/src/3rdparty/libwebp/src/enc/backward_references_enc.h index 3a19aa7..103ddfd 100644 --- a/src/3rdparty/libwebp/src/enc/backward_references_enc.h +++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.h @@ -10,13 +10,13 @@ // Author: Jyrki Alakuijala (jyrki@google.com) // -#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_ -#define WEBP_ENC_BACKWARD_REFERENCES_H_ +#ifndef WEBP_ENC_BACKWARD_REFERENCES_ENC_H_ +#define WEBP_ENC_BACKWARD_REFERENCES_ENC_H_ #include <assert.h> #include <stdlib.h> -#include "../webp/types.h" -#include "../webp/format_constants.h" +#include "src/webp/types.h" +#include "src/webp/format_constants.h" #ifdef __cplusplus extern "C" { @@ -91,11 +91,6 @@ static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) { return p->len; } -static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) { - assert(p->mode == kLiteral); - return p->argb_or_distance; -} - static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) { assert(p->mode == kCacheIdx); assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS)); @@ -113,6 +108,16 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) { #define HASH_BITS 18 #define HASH_SIZE (1 << HASH_BITS) +// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it +// is used in VP8LHashChain. +#define MAX_LENGTH_BITS 12 +#define WINDOW_SIZE_BITS 20 +// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits. +#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1) +#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32 +#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32" +#endif + typedef struct VP8LHashChain VP8LHashChain; struct VP8LHashChain { // The 20 most significant bits contain the offset at which the best match @@ -134,6 +139,24 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality, int low_effort); void VP8LHashChainClear(VP8LHashChain* const p); // release memory +static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p, + const int base_position) { + return p->offset_length_[base_position] >> MAX_LENGTH_BITS; +} + +static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p, + const int base_position) { + return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1); +} + +static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p, + int base_position, + int* const offset_ptr, + int* const length_ptr) { + *offset_ptr = VP8LHashChainFindOffset(p, base_position); + *length_ptr = VP8LHashChainFindLength(p, base_position); +} + // ----------------------------------------------------------------------------- // VP8LBackwardRefs (block-based backward-references storage) @@ -158,9 +181,6 @@ struct VP8LBackwardRefs { void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size); // Release memory for backward references. void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs); -// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error. -int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src, - VP8LBackwardRefs* const dst); // Cursor for iterating on references content typedef struct { @@ -189,6 +209,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) { // ----------------------------------------------------------------------------- // Main entry points +enum VP8LLZ77Type { + kLZ77Standard = 1, + kLZ77RLE = 2, + kLZ77Box = 4 +}; + // Evaluates best possible backward references for specified quality. // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache // bits to use (passing 0 implies disabling the local color cache). @@ -197,11 +223,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) { // refs[0] or refs[1]. VP8LBackwardRefs* VP8LGetBackwardReferences( int width, int height, const uint32_t* const argb, int quality, - int low_effort, int* const cache_bits, - const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]); + int low_effort, int lz77_types_to_try, int* const cache_bits, + const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1, + VP8LBackwardRefs* const refs_tmp2); #ifdef __cplusplus } #endif -#endif // WEBP_ENC_BACKWARD_REFERENCES_H_ +#endif // WEBP_ENC_BACKWARD_REFERENCES_ENC_H_ diff --git a/src/3rdparty/libwebp/src/enc/config_enc.c b/src/3rdparty/libwebp/src/enc/config_enc.c index 4589dc0..9d48289 100644 --- a/src/3rdparty/libwebp/src/enc/config_enc.c +++ b/src/3rdparty/libwebp/src/enc/config_enc.c @@ -12,10 +12,10 @@ // Author: Skal (pascal.massimino@gmail.com) #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif -#include "../webp/encode.h" +#include "src/webp/encode.h" //------------------------------------------------------------------------------ // WebPConfig diff --git a/src/3rdparty/libwebp/src/enc/cost_enc.c b/src/3rdparty/libwebp/src/enc/cost_enc.c index c823f5a..48fd9bc 100644 --- a/src/3rdparty/libwebp/src/enc/cost_enc.c +++ b/src/3rdparty/libwebp/src/enc/cost_enc.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./cost_enc.h" +#include "src/enc/cost_enc.h" //------------------------------------------------------------------------------ // Level cost tables diff --git a/src/3rdparty/libwebp/src/enc/cost_enc.h b/src/3rdparty/libwebp/src/enc/cost_enc.h index 99e4b37..a4b177b 100644 --- a/src/3rdparty/libwebp/src/enc/cost_enc.h +++ b/src/3rdparty/libwebp/src/enc/cost_enc.h @@ -11,12 +11,12 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_ENC_COST_H_ -#define WEBP_ENC_COST_H_ +#ifndef WEBP_ENC_COST_ENC_H_ +#define WEBP_ENC_COST_ENC_H_ #include <assert.h> #include <stdlib.h> -#include "./vp8i_enc.h" +#include "src/enc/vp8i_enc.h" #ifdef __cplusplus extern "C" { @@ -79,4 +79,4 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES]; } // extern "C" #endif -#endif /* WEBP_ENC_COST_H_ */ +#endif // WEBP_ENC_COST_ENC_H_ diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c deleted file mode 100644 index eaf0f05..0000000 --- a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c +++ /dev/null @@ -1,455 +0,0 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// Author: Mislav Bradac (mislavm@google.com) -// - -#include "./delta_palettization_enc.h" - -#ifdef WEBP_EXPERIMENTAL_FEATURES -#include "../webp/types.h" -#include "../dsp/lossless.h" - -#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b)) - -// Format allows palette up to 256 entries, but more palette entries produce -// bigger entropy. In the future it will probably be useful to add more entries -// that are far from the origin of the palette or choose remaining entries -// dynamically. -#define DELTA_PALETTE_SIZE 226 - -// Palette used for delta_palettization. Entries are roughly sorted by distance -// of their signed equivalents from the origin. -static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = { - MK_COL(0u, 0u, 0u), - MK_COL(255u, 255u, 255u), - MK_COL(1u, 1u, 1u), - MK_COL(254u, 254u, 254u), - MK_COL(2u, 2u, 2u), - MK_COL(4u, 4u, 4u), - MK_COL(252u, 252u, 252u), - MK_COL(250u, 0u, 0u), - MK_COL(0u, 250u, 0u), - MK_COL(0u, 0u, 250u), - MK_COL(6u, 0u, 0u), - MK_COL(0u, 6u, 0u), - MK_COL(0u, 0u, 6u), - MK_COL(0u, 0u, 248u), - MK_COL(0u, 0u, 8u), - MK_COL(0u, 248u, 0u), - MK_COL(0u, 248u, 248u), - MK_COL(0u, 248u, 8u), - MK_COL(0u, 8u, 0u), - MK_COL(0u, 8u, 248u), - MK_COL(0u, 8u, 8u), - MK_COL(8u, 8u, 8u), - MK_COL(248u, 0u, 0u), - MK_COL(248u, 0u, 248u), - MK_COL(248u, 0u, 8u), - MK_COL(248u, 248u, 0u), - MK_COL(248u, 8u, 0u), - MK_COL(8u, 0u, 0u), - MK_COL(8u, 0u, 248u), - MK_COL(8u, 0u, 8u), - MK_COL(8u, 248u, 0u), - MK_COL(8u, 8u, 0u), - MK_COL(23u, 23u, 23u), - MK_COL(13u, 13u, 13u), - MK_COL(232u, 232u, 232u), - MK_COL(244u, 244u, 244u), - MK_COL(245u, 245u, 250u), - MK_COL(50u, 50u, 50u), - MK_COL(204u, 204u, 204u), - MK_COL(236u, 236u, 236u), - MK_COL(16u, 16u, 16u), - MK_COL(240u, 16u, 16u), - MK_COL(16u, 240u, 16u), - MK_COL(240u, 240u, 16u), - MK_COL(16u, 16u, 240u), - MK_COL(240u, 16u, 240u), - MK_COL(16u, 240u, 240u), - MK_COL(240u, 240u, 240u), - MK_COL(0u, 0u, 232u), - MK_COL(0u, 232u, 0u), - MK_COL(232u, 0u, 0u), - MK_COL(0u, 0u, 24u), - MK_COL(0u, 24u, 0u), - MK_COL(24u, 0u, 0u), - MK_COL(32u, 32u, 32u), - MK_COL(224u, 32u, 32u), - MK_COL(32u, 224u, 32u), - MK_COL(224u, 224u, 32u), - MK_COL(32u, 32u, 224u), - MK_COL(224u, 32u, 224u), - MK_COL(32u, 224u, 224u), - MK_COL(224u, 224u, 224u), - MK_COL(0u, 0u, 176u), - MK_COL(0u, 0u, 80u), - MK_COL(0u, 176u, 0u), - MK_COL(0u, 176u, 176u), - MK_COL(0u, 176u, 80u), - MK_COL(0u, 80u, 0u), - MK_COL(0u, 80u, 176u), - MK_COL(0u, 80u, 80u), - MK_COL(176u, 0u, 0u), - MK_COL(176u, 0u, 176u), - MK_COL(176u, 0u, 80u), - MK_COL(176u, 176u, 0u), - MK_COL(176u, 80u, 0u), - MK_COL(80u, 0u, 0u), - MK_COL(80u, 0u, 176u), - MK_COL(80u, 0u, 80u), - MK_COL(80u, 176u, 0u), - MK_COL(80u, 80u, 0u), - MK_COL(0u, 0u, 152u), - MK_COL(0u, 0u, 104u), - MK_COL(0u, 152u, 0u), - MK_COL(0u, 152u, 152u), - MK_COL(0u, 152u, 104u), - MK_COL(0u, 104u, 0u), - MK_COL(0u, 104u, 152u), - MK_COL(0u, 104u, 104u), - MK_COL(152u, 0u, 0u), - MK_COL(152u, 0u, 152u), - MK_COL(152u, 0u, 104u), - MK_COL(152u, 152u, 0u), - MK_COL(152u, 104u, 0u), - MK_COL(104u, 0u, 0u), - MK_COL(104u, 0u, 152u), - MK_COL(104u, 0u, 104u), - MK_COL(104u, 152u, 0u), - MK_COL(104u, 104u, 0u), - MK_COL(216u, 216u, 216u), - MK_COL(216u, 216u, 40u), - MK_COL(216u, 216u, 176u), - MK_COL(216u, 216u, 80u), - MK_COL(216u, 40u, 216u), - MK_COL(216u, 40u, 40u), - MK_COL(216u, 40u, 176u), - MK_COL(216u, 40u, 80u), - MK_COL(216u, 176u, 216u), - MK_COL(216u, 176u, 40u), - MK_COL(216u, 176u, 176u), - MK_COL(216u, 176u, 80u), - MK_COL(216u, 80u, 216u), - MK_COL(216u, 80u, 40u), - MK_COL(216u, 80u, 176u), - MK_COL(216u, 80u, 80u), - MK_COL(40u, 216u, 216u), - MK_COL(40u, 216u, 40u), - MK_COL(40u, 216u, 176u), - MK_COL(40u, 216u, 80u), - MK_COL(40u, 40u, 216u), - MK_COL(40u, 40u, 40u), - MK_COL(40u, 40u, 176u), - MK_COL(40u, 40u, 80u), - MK_COL(40u, 176u, 216u), - MK_COL(40u, 176u, 40u), - MK_COL(40u, 176u, 176u), - MK_COL(40u, 176u, 80u), - MK_COL(40u, 80u, 216u), - MK_COL(40u, 80u, 40u), - MK_COL(40u, 80u, 176u), - MK_COL(40u, 80u, 80u), - MK_COL(80u, 216u, 216u), - MK_COL(80u, 216u, 40u), - MK_COL(80u, 216u, 176u), - MK_COL(80u, 216u, 80u), - MK_COL(80u, 40u, 216u), - MK_COL(80u, 40u, 40u), - MK_COL(80u, 40u, 176u), - MK_COL(80u, 40u, 80u), - MK_COL(80u, 176u, 216u), - MK_COL(80u, 176u, 40u), - MK_COL(80u, 176u, 176u), - MK_COL(80u, 176u, 80u), - MK_COL(80u, 80u, 216u), - MK_COL(80u, 80u, 40u), - MK_COL(80u, 80u, 176u), - MK_COL(80u, 80u, 80u), - MK_COL(0u, 0u, 192u), - MK_COL(0u, 0u, 64u), - MK_COL(0u, 0u, 128u), - MK_COL(0u, 192u, 0u), - MK_COL(0u, 192u, 192u), - MK_COL(0u, 192u, 64u), - MK_COL(0u, 192u, 128u), - MK_COL(0u, 64u, 0u), - MK_COL(0u, 64u, 192u), - MK_COL(0u, 64u, 64u), - MK_COL(0u, 64u, 128u), - MK_COL(0u, 128u, 0u), - MK_COL(0u, 128u, 192u), - MK_COL(0u, 128u, 64u), - MK_COL(0u, 128u, 128u), - MK_COL(176u, 216u, 216u), - MK_COL(176u, 216u, 40u), - MK_COL(176u, 216u, 176u), - MK_COL(176u, 216u, 80u), - MK_COL(176u, 40u, 216u), - MK_COL(176u, 40u, 40u), - MK_COL(176u, 40u, 176u), - MK_COL(176u, 40u, 80u), - MK_COL(176u, 176u, 216u), - MK_COL(176u, 176u, 40u), - MK_COL(176u, 176u, 176u), - MK_COL(176u, 176u, 80u), - MK_COL(176u, 80u, 216u), - MK_COL(176u, 80u, 40u), - MK_COL(176u, 80u, 176u), - MK_COL(176u, 80u, 80u), - MK_COL(192u, 0u, 0u), - MK_COL(192u, 0u, 192u), - MK_COL(192u, 0u, 64u), - MK_COL(192u, 0u, 128u), - MK_COL(192u, 192u, 0u), - MK_COL(192u, 192u, 192u), - MK_COL(192u, 192u, 64u), - MK_COL(192u, 192u, 128u), - MK_COL(192u, 64u, 0u), - MK_COL(192u, 64u, 192u), - MK_COL(192u, 64u, 64u), - MK_COL(192u, 64u, 128u), - MK_COL(192u, 128u, 0u), - MK_COL(192u, 128u, 192u), - MK_COL(192u, 128u, 64u), - MK_COL(192u, 128u, 128u), - MK_COL(64u, 0u, 0u), - MK_COL(64u, 0u, 192u), - MK_COL(64u, 0u, 64u), - MK_COL(64u, 0u, 128u), - MK_COL(64u, 192u, 0u), - MK_COL(64u, 192u, 192u), - MK_COL(64u, 192u, 64u), - MK_COL(64u, 192u, 128u), - MK_COL(64u, 64u, 0u), - MK_COL(64u, 64u, 192u), - MK_COL(64u, 64u, 64u), - MK_COL(64u, 64u, 128u), - MK_COL(64u, 128u, 0u), - MK_COL(64u, 128u, 192u), - MK_COL(64u, 128u, 64u), - MK_COL(64u, 128u, 128u), - MK_COL(128u, 0u, 0u), - MK_COL(128u, 0u, 192u), - MK_COL(128u, 0u, 64u), - MK_COL(128u, 0u, 128u), - MK_COL(128u, 192u, 0u), - MK_COL(128u, 192u, 192u), - MK_COL(128u, 192u, 64u), - MK_COL(128u, 192u, 128u), - MK_COL(128u, 64u, 0u), - MK_COL(128u, 64u, 192u), - MK_COL(128u, 64u, 64u), - MK_COL(128u, 64u, 128u), - MK_COL(128u, 128u, 0u), - MK_COL(128u, 128u, 192u), - MK_COL(128u, 128u, 64u), - MK_COL(128u, 128u, 128u), -}; - -#undef MK_COL - -//------------------------------------------------------------------------------ -// TODO(skal): move the functions to dsp/lossless.c when the correct -// granularity is found. For now, we'll just copy-paste some useful bits -// here instead. - -// In-place sum of each component with mod 256. -static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) { - const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u); - const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu); - *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); -} - -static WEBP_INLINE uint32_t Clip255(uint32_t a) { - if (a < 256) { - return a; - } - // return 0, when a is a negative integer. - // return 255, when a is positive. - return ~a >> 24; -} - -// Delta palettization functions. -static WEBP_INLINE int Square(int x) { - return x * x; -} - -static WEBP_INLINE uint32_t Intensity(uint32_t a) { - return - 30 * ((a >> 16) & 0xff) + - 59 * ((a >> 8) & 0xff) + - 11 * ((a >> 0) & 0xff); -} - -static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value, - uint32_t palette_entry) { - int i; - uint32_t distance = 0; - AddPixelsEq(&predicted_value, palette_entry); - for (i = 0; i < 32; i += 8) { - const int32_t av = (actual_value >> i) & 0xff; - const int32_t pv = (predicted_value >> i) & 0xff; - distance += Square(pv - av); - } - // We sum square of intensity difference with factor 10, but because Intensity - // returns 100 times real intensity we need to multiply differences of colors - // by 1000. - distance *= 1000u; - distance += Square(Intensity(predicted_value) - - Intensity(actual_value)); - return distance; -} - -static uint32_t Predict(int x, int y, uint32_t* image) { - const uint32_t t = (y == 0) ? ARGB_BLACK : image[x]; - const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1]; - const uint32_t p = - (((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) + - (((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) + - (((((t >> 8) & 0xff) + ((l >> 8) & 0xff)) / 2) << 8) + - (((((t >> 0) & 0xff) + ((l >> 0) & 0xff)) / 2) << 0); - if (x == 0 && y == 0) return ARGB_BLACK; - if (x == 0) return t; - if (y == 0) return l; - return p; -} - -static WEBP_INLINE int AddSubtractComponentFullWithCoefficient( - int a, int b, int c) { - return Clip255(a + ((b - c) >> 2)); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient( - uint32_t c0, uint32_t c1, uint32_t c2) { - const int a = AddSubtractComponentFullWithCoefficient( - c0 >> 24, c1 >> 24, c2 >> 24); - const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff, - (c1 >> 16) & 0xff, - (c2 >> 16) & 0xff); - const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff, - (c1 >> 8) & 0xff, - (c2 >> 8) & 0xff); - const int b = AddSubtractComponentFullWithCoefficient( - c0 & 0xff, c1 & 0xff, c2 & 0xff); - return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; -} - -//------------------------------------------------------------------------------ - -// Find palette entry with minimum error from difference of actual pixel value -// and predicted pixel value. Propagate error of pixel to its top and left pixel -// in src array. Write predicted_value + palette_entry to new_image. Return -// index of best palette entry. -static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value, - const uint32_t palette[], int palette_size) { - int i; - int idx = 0; - uint32_t best_distance = CalcDist(predicted_value, src, palette[0]); - for (i = 1; i < palette_size; ++i) { - const uint32_t distance = CalcDist(predicted_value, src, palette[i]); - if (distance < best_distance) { - best_distance = distance; - idx = i; - } - } - return idx; -} - -static void ApplyBestPaletteEntry(int x, int y, - uint32_t new_value, uint32_t palette_value, - uint32_t* src, int src_stride, - uint32_t* new_image) { - AddPixelsEq(&new_value, palette_value); - if (x > 0) { - src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1], - new_value, src[x]); - } - if (y > 0) { - src[x - src_stride] = - ClampedAddSubtractFullWithCoefficient(src[x - src_stride], - new_value, src[x]); - } - new_image[x] = new_value; -} - -//------------------------------------------------------------------------------ -// Main entry point - -static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst, - uint32_t src_stride, - uint32_t dst_stride, - const uint32_t* palette, - int palette_size, - int width, int height, - int num_passes) { - int x, y; - WebPEncodingError err = VP8_ENC_OK; - uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image)); - uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row)); - if (new_image == NULL || tmp_row == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - - while (num_passes--) { - uint32_t* cur_src = src; - uint32_t* cur_dst = dst; - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - const uint32_t predicted_value = Predict(x, y, new_image); - tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value, - palette, palette_size); - ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]], - cur_src, src_stride, new_image); - } - for (x = 0; x < width; ++x) { - cur_dst[x] = palette[tmp_row[x]]; - } - cur_src += src_stride; - cur_dst += dst_stride; - } - } - Error: - WebPSafeFree(new_image); - WebPSafeFree(tmp_row); - return err; -} - -// replaces enc->argb_ by a palettizable approximation of it, -// and generates optimal enc->palette_[] -WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) { - const WebPPicture* const pic = enc->pic_; - uint32_t* src = pic->argb; - uint32_t* dst = enc->argb_; - const int width = pic->width; - const int height = pic->height; - - WebPEncodingError err = VP8_ENC_OK; - memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette)); - enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u; - enc->palette_size_ = DELTA_PALETTE_SIZE; - err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_, - enc->palette_, enc->palette_size_, - width, height, 2); - if (err != VP8_ENC_OK) goto Error; - - Error: - return err; -} - -#else // !WEBP_EXPERIMENTAL_FEATURES - -WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) { - (void)enc; - return VP8_ENC_ERROR_INVALID_CONFIGURATION; -} - -#endif // WEBP_EXPERIMENTAL_FEATURES diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h deleted file mode 100644 index 63048ec..0000000 --- a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// Author: Mislav Bradac (mislavm@google.com) -// - -#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_ -#define WEBP_ENC_DELTA_PALETTIZATION_H_ - -#include "../webp/encode.h" -#include "../enc/vp8li_enc.h" - -// Replaces enc->argb_[] input by a palettizable approximation of it, -// and generates optimal enc->palette_[]. -// This function can revert enc->use_palette_ / enc->use_predict_ flag -// if delta-palettization is not producing expected saving. -WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc); - -#endif // WEBP_ENC_DELTA_PALETTIZATION_H_ diff --git a/src/3rdparty/libwebp/src/enc/filter_enc.c b/src/3rdparty/libwebp/src/enc/filter_enc.c index 4bc3672..580800b 100644 --- a/src/3rdparty/libwebp/src/enc/filter_enc.c +++ b/src/3rdparty/libwebp/src/enc/filter_enc.c @@ -12,8 +12,8 @@ // Author: somnath@google.com (Somnath Banerjee) #include <assert.h> -#include "./vp8i_enc.h" -#include "../dsp/dsp.h" +#include "src/enc/vp8i_enc.h" +#include "src/dsp/dsp.h" // This table gives, for a given sharpness, the filtering strength to be // used (at least) in order to filter a given edge step delta. @@ -65,6 +65,8 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta) { //------------------------------------------------------------------------------ // Paragraph 15.4: compute the inner-edge filtering strength +#if !defined(WEBP_REDUCE_SIZE) + static int GetILevel(int sharpness, int level) { if (sharpness > 0) { if (sharpness > 4) { @@ -129,11 +131,14 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) { return sum; } +#endif // !defined(WEBP_REDUCE_SIZE) + //------------------------------------------------------------------------------ // Exposed APIs: Encoder should call the following 3 functions to adjust // loop filter strength void VP8InitFilter(VP8EncIterator* const it) { +#if !defined(WEBP_REDUCE_SIZE) if (it->lf_stats_ != NULL) { int s, i; for (s = 0; s < NUM_MB_SEGMENTS; s++) { @@ -143,9 +148,13 @@ void VP8InitFilter(VP8EncIterator* const it) { } VP8SSIMDspInit(); } +#else + (void)it; +#endif } void VP8StoreFilterStats(VP8EncIterator* const it) { +#if !defined(WEBP_REDUCE_SIZE) int d; VP8Encoder* const enc = it->enc_; const int s = it->mb_->segment_; @@ -177,10 +186,14 @@ void VP8StoreFilterStats(VP8EncIterator* const it) { DoFilter(it, level); (*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_); } +#else // defined(WEBP_REDUCE_SIZE) + (void)it; +#endif // !defined(WEBP_REDUCE_SIZE) } void VP8AdjustFilterStrength(VP8EncIterator* const it) { VP8Encoder* const enc = it->enc_; +#if !defined(WEBP_REDUCE_SIZE) if (it->lf_stats_ != NULL) { int s; for (s = 0; s < NUM_MB_SEGMENTS; s++) { @@ -196,7 +209,10 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it) { } enc->dqm_[s].fstrength_ = best_level; } - } else if (enc->config_->filter_strength > 0) { + return; + } +#endif // !defined(WEBP_REDUCE_SIZE) + if (enc->config_->filter_strength > 0) { int max_level = 0; int s; for (s = 0; s < NUM_MB_SEGMENTS; s++) { diff --git a/src/3rdparty/libwebp/src/enc/frame_enc.c b/src/3rdparty/libwebp/src/enc/frame_enc.c index abef523..1aec376 100644 --- a/src/3rdparty/libwebp/src/enc/frame_enc.c +++ b/src/3rdparty/libwebp/src/enc/frame_enc.c @@ -14,10 +14,10 @@ #include <string.h> #include <math.h> -#include "./cost_enc.h" -#include "./vp8i_enc.h" -#include "../dsp/dsp.h" -#include "../webp/format_constants.h" // RIFF constants +#include "src/enc/cost_enc.h" +#include "src/enc/vp8i_enc.h" +#include "src/dsp/dsp.h" +#include "src/webp/format_constants.h" // RIFF constants #define SEGMENT_VISU 0 #define DEBUG_SEARCH 0 // useful to track search convergence @@ -198,13 +198,15 @@ static void SetSegmentProbas(VP8Encoder* const enc) { for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) { const VP8MBInfo* const mb = &enc->mb_info_[n]; - p[mb->segment_]++; + ++p[mb->segment_]; } +#if !defined(WEBP_DISABLE_STATS) if (enc->pic_->stats != NULL) { for (n = 0; n < NUM_MB_SEGMENTS; ++n) { enc->pic_->stats->segment_size[n] = p[n]; } } +#endif if (enc->segment_hdr_.num_segments_ > 1) { uint8_t* const probas = enc->proba_.segments_; probas[0] = GetProba(p[0] + p[1], p[2] + p[3]); @@ -452,6 +454,8 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd, //------------------------------------------------------------------------------ // ExtraInfo map / Debug function +#if !defined(WEBP_DISABLE_STATS) + #if SEGMENT_VISU static void SetBlock(uint8_t* p, int value, int size) { int y; @@ -516,6 +520,34 @@ static void StoreSideInfo(const VP8EncIterator* const it) { #endif } +static void ResetSideInfo(const VP8EncIterator* const it) { + VP8Encoder* const enc = it->enc_; + WebPPicture* const pic = enc->pic_; + if (pic->stats != NULL) { + memset(enc->block_count_, 0, sizeof(enc->block_count_)); + } + ResetSSE(enc); +} +#else // defined(WEBP_DISABLE_STATS) +static void ResetSSE(VP8Encoder* const enc) { + (void)enc; +} +static void StoreSideInfo(const VP8EncIterator* const it) { + VP8Encoder* const enc = it->enc_; + WebPPicture* const pic = enc->pic_; + if (pic->extra_info != NULL) { + if (it->x_ == 0 && it->y_ == 0) { // only do it once, at start + memset(pic->extra_info, 0, + enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info)); + } + } +} + +static void ResetSideInfo(const VP8EncIterator* const it) { + (void)it; +} +#endif // !defined(WEBP_DISABLE_STATS) + static double GetPSNR(uint64_t mse, uint64_t size) { return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99; } @@ -552,7 +584,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt, VP8IteratorImport(&it, NULL); if (VP8Decimate(&it, &info, rd_opt)) { // Just record the number of skips and act like skip_proba is not used. - enc->proba_.nb_skip_++; + ++enc->proba_.nb_skip_; } RecordResiduals(&it, &info); size += info.R + info.H; @@ -640,7 +672,7 @@ static int StatLoop(VP8Encoder* const enc) { // Main loops // -static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 }; +static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 }; static int PreLoopInitialize(VP8Encoder* const enc) { int p; @@ -670,6 +702,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) { } if (ok) { // All good. Finish up. +#if !defined(WEBP_DISABLE_STATS) if (enc->pic_->stats != NULL) { // finalize byte counters... int i, s; for (i = 0; i <= 2; ++i) { @@ -678,6 +711,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) { } } } +#endif VP8AdjustFilterStrength(it); // ...and store filter stats. } else { // Something bad happened -> need to do some memory cleanup. @@ -821,6 +855,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) { if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) { ++num_pass_left; enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation... + if (is_last_pass) { + ResetSideInfo(&it); + } continue; // ...and start over } if (is_last_pass) { @@ -851,4 +888,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) { #endif // DISABLE_TOKEN_BUFFER //------------------------------------------------------------------------------ - diff --git a/src/3rdparty/libwebp/src/enc/histogram_enc.c b/src/3rdparty/libwebp/src/enc/histogram_enc.c index 808b6f7..d89b985 100644 --- a/src/3rdparty/libwebp/src/enc/histogram_enc.c +++ b/src/3rdparty/libwebp/src/enc/histogram_enc.c @@ -10,16 +10,16 @@ // Author: Jyrki Alakuijala (jyrki@google.com) // #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif #include <math.h> -#include "./backward_references_enc.h" -#include "./histogram_enc.h" -#include "../dsp/lossless.h" -#include "../dsp/lossless_common.h" -#include "../utils/utils.h" +#include "src/enc/backward_references_enc.h" +#include "src/enc/histogram_enc.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" +#include "src/utils/utils.h" #define MAX_COST 1.e38 @@ -51,10 +51,12 @@ static void HistogramCopy(const VP8LHistogram* const src, VP8LHistogram* const dst) { uint32_t* const dst_literal = dst->literal_; const int dst_cache_bits = dst->palette_code_bits_; + const int literal_size = VP8LHistogramNumCodes(dst_cache_bits); const int histo_size = VP8LGetHistogramSize(dst_cache_bits); assert(src->palette_code_bits_ == dst_cache_bits); memcpy(dst, src, histo_size); dst->literal_ = dst_literal; + memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_)); } int VP8LGetHistogramSize(int cache_bits) { @@ -76,7 +78,7 @@ void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs, VP8LHistogram* const histo) { VP8LRefsCursor c = VP8LRefsCursorInit(refs); while (VP8LRefsCursorOk(&c)) { - VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos); + VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0); VP8LRefsCursorNext(&c); } } @@ -91,9 +93,19 @@ void VP8LHistogramCreate(VP8LHistogram* const p, VP8LHistogramStoreRefs(refs, p); } -void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) { +void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits, + int init_arrays) { p->palette_code_bits_ = palette_code_bits; - HistogramClear(p); + if (init_arrays) { + HistogramClear(p); + } else { + p->trivial_symbol_ = 0; + p->bit_cost_ = 0.; + p->literal_cost_ = 0.; + p->red_cost_ = 0.; + p->blue_cost_ = 0.; + memset(p->is_used_, 0, sizeof(p->is_used_)); + } } VP8LHistogram* VP8LAllocateHistogram(int cache_bits) { @@ -104,41 +116,90 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) { histo = (VP8LHistogram*)memory; // literal_ won't necessary be aligned. histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); - VP8LHistogramInit(histo, cache_bits); + VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 0); return histo; } +// Resets the pointers of the histograms to point to the bit buffer in the set. +static void HistogramSetResetPointers(VP8LHistogramSet* const set, + int cache_bits) { + int i; + const int histo_size = VP8LGetHistogramSize(cache_bits); + uint8_t* memory = (uint8_t*) (set->histograms); + memory += set->max_size * sizeof(*set->histograms); + for (i = 0; i < set->max_size; ++i) { + memory = (uint8_t*) WEBP_ALIGN(memory); + set->histograms[i] = (VP8LHistogram*) memory; + // literal_ won't necessary be aligned. + set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); + memory += histo_size; + } +} + +// Returns the total size of the VP8LHistogramSet. +static size_t HistogramSetTotalSize(int size, int cache_bits) { + const int histo_size = VP8LGetHistogramSize(cache_bits); + return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) + + histo_size + WEBP_ALIGN_CST)); +} + VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) { int i; VP8LHistogramSet* set; - const int histo_size = VP8LGetHistogramSize(cache_bits); - const size_t total_size = - sizeof(*set) + size * (sizeof(*set->histograms) + - histo_size + WEBP_ALIGN_CST); + const size_t total_size = HistogramSetTotalSize(size, cache_bits); uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory)); if (memory == NULL) return NULL; set = (VP8LHistogramSet*)memory; memory += sizeof(*set); set->histograms = (VP8LHistogram**)memory; - memory += size * sizeof(*set->histograms); set->max_size = size; set->size = size; + HistogramSetResetPointers(set, cache_bits); for (i = 0; i < size; ++i) { - memory = (uint8_t*)WEBP_ALIGN(memory); - set->histograms[i] = (VP8LHistogram*)memory; - // literal_ won't necessary be aligned. - set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram)); - VP8LHistogramInit(set->histograms[i], cache_bits); - memory += histo_size; + VP8LHistogramInit(set->histograms[i], cache_bits, /*init_arrays=*/ 0); } return set; } +void VP8LHistogramSetClear(VP8LHistogramSet* const set) { + int i; + const int cache_bits = set->histograms[0]->palette_code_bits_; + const int size = set->max_size; + const size_t total_size = HistogramSetTotalSize(size, cache_bits); + uint8_t* memory = (uint8_t*)set; + + memset(memory, 0, total_size); + memory += sizeof(*set); + set->histograms = (VP8LHistogram**)memory; + set->max_size = size; + set->size = size; + HistogramSetResetPointers(set, cache_bits); + for (i = 0; i < size; ++i) { + set->histograms[i]->palette_code_bits_ = cache_bits; + } +} + +// Removes the histogram 'i' from 'set' by setting it to NULL. +static void HistogramSetRemoveHistogram(VP8LHistogramSet* const set, int i, + int* const num_used) { + assert(set->histograms[i] != NULL); + set->histograms[i] = NULL; + --*num_used; + // If we remove the last valid one, shrink until the next valid one. + if (i == set->size - 1) { + while (set->size >= 1 && set->histograms[set->size - 1] == NULL) { + --set->size; + } + } +} + // ----------------------------------------------------------------------------- void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, - const PixOrCopy* const v) { + const PixOrCopy* const v, + int (*const distance_modifier)(int, int), + int distance_modifier_arg0) { if (PixOrCopyIsLiteral(v)) { ++histo->alpha_[PixOrCopyLiteral(v, 3)]; ++histo->red_[PixOrCopyLiteral(v, 2)]; @@ -152,7 +213,13 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, int code, extra_bits; VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits); ++histo->literal_[NUM_LITERAL_CODES + code]; - VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits); + if (distance_modifier == NULL) { + VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits); + } else { + VP8LPrefixEncodeBits( + distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)), + &code, &extra_bits); + } ++histo->distance_[code]; } } @@ -192,14 +259,9 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) { } } -double VP8LBitsEntropy(const uint32_t* const array, int n, - uint32_t* const trivial_symbol) { +double VP8LBitsEntropy(const uint32_t* const array, int n) { VP8LBitEntropy entropy; VP8LBitsEntropyUnrefined(array, n, &entropy); - if (trivial_symbol != NULL) { - *trivial_symbol = - (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM; - } return BitsEntropyRefine(&entropy); } @@ -234,7 +296,8 @@ static double FinalHuffmanCost(const VP8LStreaks* const stats) { // Get the symbol entropy for the distribution 'population'. // Set 'trivial_sym', if there's only one symbol present in the distribution. static double PopulationCost(const uint32_t* const population, int length, - uint32_t* const trivial_sym) { + uint32_t* const trivial_sym, + uint8_t* const is_used) { VP8LBitEntropy bit_entropy; VP8LStreaks stats; VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats); @@ -242,6 +305,8 @@ static double PopulationCost(const uint32_t* const population, int length, *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM; } + // The histogram is used if there is at least one non-zero streak. + *is_used = (stats.streaks[1][0] != 0 || stats.streaks[1][1] != 0); return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats); } @@ -250,7 +315,9 @@ static double PopulationCost(const uint32_t* const population, int length, // non-zero: both the zero-th one, or both the last one. static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X, const uint32_t* const Y, - int length, int trivial_at_end) { + int length, int is_X_used, + int is_Y_used, + int trivial_at_end) { VP8LStreaks stats; if (trivial_at_end) { // This configuration is due to palettization that transforms an indexed @@ -259,28 +326,43 @@ static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X, // Only FinalHuffmanCost needs to be evaluated. memset(&stats, 0, sizeof(stats)); // Deal with the non-zero value at index 0 or length-1. - stats.streaks[1][0] += 1; + stats.streaks[1][0] = 1; // Deal with the following/previous zero streak. - stats.counts[0] += 1; - stats.streaks[0][1] += length - 1; + stats.counts[0] = 1; + stats.streaks[0][1] = length - 1; return FinalHuffmanCost(&stats); } else { VP8LBitEntropy bit_entropy; - VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats); + if (is_X_used) { + if (is_Y_used) { + VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats); + } else { + VP8LGetEntropyUnrefined(X, length, &bit_entropy, &stats); + } + } else { + if (is_Y_used) { + VP8LGetEntropyUnrefined(Y, length, &bit_entropy, &stats); + } else { + memset(&stats, 0, sizeof(stats)); + stats.counts[0] = 1; + stats.streaks[0][length > 3] = length; + VP8LBitEntropyInit(&bit_entropy); + } + } return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats); } } // Estimates the Entropy + Huffman + other block overhead size cost. -double VP8LHistogramEstimateBits(const VP8LHistogram* const p) { +double VP8LHistogramEstimateBits(VP8LHistogram* const p) { return - PopulationCost( - p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL) - + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL) - + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL) - + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL) - + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL) + PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), + NULL, &p->is_used_[0]) + + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) + + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) + + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) + + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4]) + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES) + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES); } @@ -296,7 +378,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, int trivial_at_end = 0; assert(a->palette_code_bits_ == b->palette_code_bits_); *cost += GetCombinedEntropy(a->literal_, b->literal_, - VP8LHistogramNumCodes(palette_code_bits), 0); + VP8LHistogramNumCodes(palette_code_bits), + a->is_used_[0], b->is_used_[0], 0); *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES, b->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); @@ -316,19 +399,23 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a, } *cost += - GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, trivial_at_end); + GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1], + b->is_used_[1], trivial_at_end); if (*cost > cost_threshold) return 0; *cost += - GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, trivial_at_end); + GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2], + b->is_used_[2], trivial_at_end); if (*cost > cost_threshold) return 0; - *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES, - trivial_at_end); + *cost += + GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES, + a->is_used_[3], b->is_used_[3], trivial_at_end); if (*cost > cost_threshold) return 0; *cost += - GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, 0); + GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, + a->is_used_[4], b->is_used_[4], 0); *cost += VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES); if (*cost > cost_threshold) return 0; @@ -374,7 +461,9 @@ static double HistogramAddEval(const VP8LHistogram* const a, static double HistogramAddThresh(const VP8LHistogram* const a, const VP8LHistogram* const b, double cost_threshold) { - double cost = -a->bit_cost_; + double cost; + assert(a != NULL && b != NULL); + cost = -a->bit_cost_; GetCombinedHistogramEntropy(a, b, cost_threshold, &cost); return cost; } @@ -416,16 +505,19 @@ static void UpdateDominantCostRange( static void UpdateHistogramCost(VP8LHistogram* const h) { uint32_t alpha_sym, red_sym, blue_sym; const double alpha_cost = - PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym); + PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, + &h->is_used_[3]); const double distance_cost = - PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) + + PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) + VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES); const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_); - h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL) + - VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, - NUM_LENGTH_CODES); - h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym); - h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym); + h->literal_cost_ = + PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) + + VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES); + h->red_cost_ = + PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]); + h->blue_cost_ = + PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym, &h->is_used_[2]); h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ + alpha_cost + distance_cost; if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) { @@ -470,10 +562,11 @@ static void HistogramBuild( VP8LHistogram** const histograms = image_histo->histograms; VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs); assert(histo_bits > 0); + VP8LHistogramSetClear(image_histo); while (VP8LRefsCursorOk(&c)) { const PixOrCopy* const v = c.cur_pos; const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits); - VP8LHistogramAddSinglePixOrCopy(histograms[ix], v); + VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0); x += PixOrCopyLength(v); while (x >= xsize) { x -= xsize; @@ -484,17 +577,37 @@ static void HistogramBuild( } // Copies the histograms and computes its bit_cost. -static void HistogramCopyAndAnalyze( - VP8LHistogramSet* const orig_histo, VP8LHistogramSet* const image_histo) { - int i; - const int histo_size = orig_histo->size; +static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1); +static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo, + VP8LHistogramSet* const image_histo, + int* const num_used, + uint16_t* const histogram_symbols) { + int i, cluster_id; + int num_used_orig = *num_used; VP8LHistogram** const orig_histograms = orig_histo->histograms; VP8LHistogram** const histograms = image_histo->histograms; - for (i = 0; i < histo_size; ++i) { + assert(image_histo->max_size == orig_histo->max_size); + for (cluster_id = 0, i = 0; i < orig_histo->max_size; ++i) { VP8LHistogram* const histo = orig_histograms[i]; UpdateHistogramCost(histo); - // Copy histograms from orig_histo[] to image_histo[]. - HistogramCopy(histo, histograms[i]); + + // Skip the histogram if it is completely empty, which can happen for tiles + // with no information (when they are skipped because of LZ77). + if (!histo->is_used_[0] && !histo->is_used_[1] && !histo->is_used_[2] + && !histo->is_used_[3] && !histo->is_used_[4]) { + // The first histogram is always used. If an histogram is empty, we set + // its id to be the same as the previous one: this will improve + // compressibility for later LZ77. + assert(i > 0); + HistogramSetRemoveHistogram(image_histo, i, num_used); + HistogramSetRemoveHistogram(orig_histo, i, &num_used_orig); + histogram_symbols[i] = kInvalidHistogramSymbol; + } else { + // Copy histograms from orig_histo[] to image_histo[]. + HistogramCopy(histo, histograms[i]); + histogram_symbols[i] = cluster_id++; + assert(cluster_id <= image_histo->max_size); + } } } @@ -511,28 +624,33 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo, // Analyze the dominant (literal, red and blue) entropy costs. for (i = 0; i < histo_size; ++i) { + if (histograms[i] == NULL) continue; UpdateDominantCostRange(histograms[i], &cost_range); } // bin-hash histograms on three of the dominant (literal, red and blue) // symbol costs and store the resulting bin_id for each histogram. for (i = 0; i < histo_size; ++i) { + // bin_map[i] is not set to a special value as its use will later be guarded + // by another (histograms[i] == NULL). + if (histograms[i] == NULL) continue; bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort); } } -// Compact image_histo[] by merging some histograms with same bin_id together if -// it's advantageous. -static VP8LHistogram* HistogramCombineEntropyBin( - VP8LHistogramSet* const image_histo, - VP8LHistogram* cur_combo, - const uint16_t* const bin_map, int bin_map_size, int num_bins, - double combine_cost_factor, int low_effort) { +// Merges some histograms with same bin_id together if it's advantageous. +// Sets the remaining histograms to NULL. +static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo, + int *num_used, + const uint16_t* const clusters, + uint16_t* const cluster_mappings, + VP8LHistogram* cur_combo, + const uint16_t* const bin_map, + int num_bins, + double combine_cost_factor, + int low_effort) { VP8LHistogram** const histograms = image_histo->histograms; int idx; - // Work in-place: processed histograms are put at the beginning of - // image_histo[]. At the end, we just have to truncate the array. - int size = 0; struct { int16_t first; // position of the histogram that accumulates all // histograms with the same bin_id @@ -545,16 +663,19 @@ static VP8LHistogram* HistogramCombineEntropyBin( bin_info[idx].num_combine_failures = 0; } - for (idx = 0; idx < bin_map_size; ++idx) { - const int bin_id = bin_map[idx]; - const int first = bin_info[bin_id].first; - assert(size <= idx); + // By default, a cluster matches itself. + for (idx = 0; idx < *num_used; ++idx) cluster_mappings[idx] = idx; + for (idx = 0; idx < image_histo->size; ++idx) { + int bin_id, first; + if (histograms[idx] == NULL) continue; + bin_id = bin_map[idx]; + first = bin_info[bin_id].first; if (first == -1) { - // just move histogram #idx to its final position - histograms[size] = histograms[idx]; - bin_info[bin_id].first = size++; + bin_info[bin_id].first = idx; } else if (low_effort) { HistogramAdd(histograms[idx], histograms[first], histograms[first]); + HistogramSetRemoveHistogram(image_histo, idx, num_used); + cluster_mappings[clusters[idx]] = clusters[first]; } else { // try to merge #idx into #first (both share the same bin_id) const double bit_cost = histograms[idx]->bit_cost_; @@ -577,30 +698,28 @@ static VP8LHistogram* HistogramCombineEntropyBin( bin_info[bin_id].num_combine_failures >= max_combine_failures) { // move the (better) merged histogram to its final slot HistogramSwap(&cur_combo, &histograms[first]); + HistogramSetRemoveHistogram(image_histo, idx, num_used); + cluster_mappings[clusters[idx]] = clusters[first]; } else { - histograms[size++] = histograms[idx]; ++bin_info[bin_id].num_combine_failures; } - } else { - histograms[size++] = histograms[idx]; } } } - image_histo->size = size; if (low_effort) { // for low_effort case, update the final cost when everything is merged - for (idx = 0; idx < size; ++idx) { + for (idx = 0; idx < image_histo->size; ++idx) { + if (histograms[idx] == NULL) continue; UpdateHistogramCost(histograms[idx]); } } - return cur_combo; } +// Implement a Lehmer random number generator with a multiplicative constant of +// 48271 and a modulo constant of 2^31 - 1. static uint32_t MyRand(uint32_t* const seed) { - *seed = (*seed * 16807ull) & 0xffffffffu; - if (*seed == 0) { - *seed = 1; - } + *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u); + assert(*seed > 0); return *seed; } @@ -621,16 +740,9 @@ typedef struct { int max_size; } HistoQueue; -static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) { +static int HistoQueueInit(HistoQueue* const histo_queue, const int max_size) { histo_queue->size = 0; - // max_index^2 for the queue size is safe. If you look at - // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes - // data to the queue, you insert at most: - // - max_index*(max_index-1)/2 (the first two for loops) - // - max_index - 1 in the last for loop at the first iteration of the while - // loop, max_index - 2 at the second iteration ... therefore - // max_index*(max_index-1)/2 overall too - histo_queue->max_size = max_index * max_index; + histo_queue->max_size = max_size; // We allocate max_size + 1 because the last element at index "size" is // used as temporary data (and it could be up to max_size). histo_queue->queue = (HistogramPair*)WebPSafeMalloc( @@ -641,249 +753,319 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) { static void HistoQueueClear(HistoQueue* const histo_queue) { assert(histo_queue != NULL); WebPSafeFree(histo_queue->queue); + histo_queue->size = 0; + histo_queue->max_size = 0; } -static void SwapHistogramPairs(HistogramPair *p1, - HistogramPair *p2) { - const HistogramPair tmp = *p1; - *p1 = *p2; - *p2 = tmp; +// Pop a specific pair in the queue by replacing it with the last one +// and shrinking the queue. +static void HistoQueuePopPair(HistoQueue* const histo_queue, + HistogramPair* const pair) { + assert(pair >= histo_queue->queue && + pair < (histo_queue->queue + histo_queue->size)); + assert(histo_queue->size > 0); + *pair = histo_queue->queue[histo_queue->size - 1]; + --histo_queue->size; } -// Given a valid priority queue in range [0, queue_size) this function checks -// whether histo_queue[queue_size] should be accepted and swaps it with the -// front if it is smaller. Otherwise, it leaves it as is. -static void UpdateQueueFront(HistoQueue* const histo_queue) { - if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return; - - if (histo_queue->queue[histo_queue->size].cost_diff < - histo_queue->queue[0].cost_diff) { - SwapHistogramPairs(histo_queue->queue, - histo_queue->queue + histo_queue->size); +// Check whether a pair in the queue should be updated as head or not. +static void HistoQueueUpdateHead(HistoQueue* const histo_queue, + HistogramPair* const pair) { + assert(pair->cost_diff < 0.); + assert(pair >= histo_queue->queue && + pair < (histo_queue->queue + histo_queue->size)); + assert(histo_queue->size > 0); + if (pair->cost_diff < histo_queue->queue[0].cost_diff) { + // Replace the best pair. + const HistogramPair tmp = histo_queue->queue[0]; + histo_queue->queue[0] = *pair; + *pair = tmp; } - ++histo_queue->size; - - // We cannot add more elements than the capacity. - // The allocation adds an extra element to the official capacity so that - // histo_queue->queue[histo_queue->max_size] is read/written within bound. - assert(histo_queue->size <= histo_queue->max_size); } -// ----------------------------------------------------------------------------- - -static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2, - HistogramPair* const pair) { - VP8LHistogram* h1; - VP8LHistogram* h2; - double sum_cost; +// Update the cost diff and combo of a pair of histograms. This needs to be +// called when the the histograms have been merged with a third one. +static void HistoQueueUpdatePair(const VP8LHistogram* const h1, + const VP8LHistogram* const h2, + double threshold, + HistogramPair* const pair) { + const double sum_cost = h1->bit_cost_ + h2->bit_cost_; + pair->cost_combo = 0.; + GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo); + pair->cost_diff = pair->cost_combo - sum_cost; +} +// Create a pair from indices "idx1" and "idx2" provided its cost +// is inferior to "threshold", a negative entropy. +// It returns the cost of the pair, or 0. if it superior to threshold. +static double HistoQueuePush(HistoQueue* const histo_queue, + VP8LHistogram** const histograms, int idx1, + int idx2, double threshold) { + const VP8LHistogram* h1; + const VP8LHistogram* h2; + HistogramPair pair; + + // Stop here if the queue is full. + if (histo_queue->size == histo_queue->max_size) return 0.; + assert(threshold <= 0.); if (idx1 > idx2) { const int tmp = idx2; idx2 = idx1; idx1 = tmp; } - pair->idx1 = idx1; - pair->idx2 = idx2; + pair.idx1 = idx1; + pair.idx2 = idx2; h1 = histograms[idx1]; h2 = histograms[idx2]; - sum_cost = h1->bit_cost_ + h2->bit_cost_; - pair->cost_combo = 0.; - GetCombinedHistogramEntropy(h1, h2, sum_cost, &pair->cost_combo); - pair->cost_diff = pair->cost_combo - sum_cost; + + HistoQueueUpdatePair(h1, h2, threshold, &pair); + + // Do not even consider the pair if it does not improve the entropy. + if (pair.cost_diff >= threshold) return 0.; + + histo_queue->queue[histo_queue->size++] = pair; + HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]); + + return pair.cost_diff; } +// ----------------------------------------------------------------------------- + // Combines histograms by continuously choosing the one with the highest cost // reduction. -static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) { +static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo, + int* const num_used) { int ok = 0; - int image_histo_size = image_histo->size; + const int image_histo_size = image_histo->size; int i, j; VP8LHistogram** const histograms = image_histo->histograms; - // Indexes of remaining histograms. - int* const clusters = - (int*)WebPSafeMalloc(image_histo_size, sizeof(*clusters)); // Priority queue of histogram pairs. HistoQueue histo_queue; - if (!HistoQueueInit(&histo_queue, image_histo_size) || clusters == NULL) { + // image_histo_size^2 for the queue size is safe. If you look at + // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes + // data to the queue, you insert at most: + // - image_histo_size*(image_histo_size-1)/2 (the first two for loops) + // - image_histo_size - 1 in the last for loop at the first iteration of + // the while loop, image_histo_size - 2 at the second iteration ... + // therefore image_histo_size*(image_histo_size-1)/2 overall too + if (!HistoQueueInit(&histo_queue, image_histo_size * image_histo_size)) { goto End; } for (i = 0; i < image_histo_size; ++i) { - // Initialize clusters indexes. - clusters[i] = i; + if (image_histo->histograms[i] == NULL) continue; for (j = i + 1; j < image_histo_size; ++j) { - // Initialize positions array. - PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size]); - UpdateQueueFront(&histo_queue); + // Initialize queue. + if (image_histo->histograms[j] == NULL) continue; + HistoQueuePush(&histo_queue, histograms, i, j, 0.); } } - while (image_histo_size > 1 && histo_queue.size > 0) { - HistogramPair* copy_to; + while (histo_queue.size > 0) { const int idx1 = histo_queue.queue[0].idx1; const int idx2 = histo_queue.queue[0].idx2; HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]); histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo; + // Remove merged histogram. - for (i = 0; i + 1 < image_histo_size; ++i) { - if (clusters[i] >= idx2) { - clusters[i] = clusters[i + 1]; - } - } - --image_histo_size; + HistogramSetRemoveHistogram(image_histo, idx2, num_used); - // Remove pairs intersecting the just combined best pair. This will - // therefore pop the head of the queue. - copy_to = histo_queue.queue; - for (i = 0; i < histo_queue.size; ++i) { + // Remove pairs intersecting the just combined best pair. + for (i = 0; i < histo_queue.size;) { HistogramPair* const p = histo_queue.queue + i; if (p->idx1 == idx1 || p->idx2 == idx1 || p->idx1 == idx2 || p->idx2 == idx2) { - // Do not copy the invalid pair. - continue; - } - if (p->cost_diff < histo_queue.queue[0].cost_diff) { - // Replace the top of the queue if we found better. - SwapHistogramPairs(histo_queue.queue, p); + HistoQueuePopPair(&histo_queue, p); + } else { + HistoQueueUpdateHead(&histo_queue, p); + ++i; } - SwapHistogramPairs(copy_to, p); - ++copy_to; } - histo_queue.size = (int)(copy_to - histo_queue.queue); // Push new pairs formed with combined histogram to the queue. - for (i = 0; i < image_histo_size; ++i) { - if (clusters[i] != idx1) { - PreparePair(histograms, idx1, clusters[i], - &histo_queue.queue[histo_queue.size]); - UpdateQueueFront(&histo_queue); - } - } - } - // Move remaining histograms to the beginning of the array. - for (i = 0; i < image_histo_size; ++i) { - if (i != clusters[i]) { // swap the two histograms - HistogramSwap(&histograms[i], &histograms[clusters[i]]); + for (i = 0; i < image_histo->size; ++i) { + if (i == idx1 || image_histo->histograms[i] == NULL) continue; + HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.); } } - image_histo->size = image_histo_size; ok = 1; End: - WebPSafeFree(clusters); HistoQueueClear(&histo_queue); return ok; } -static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo, - VP8LHistogram* tmp_histo, - VP8LHistogram* best_combo, - int quality, int min_cluster_size) { - int iter; - uint32_t seed = 0; +// Perform histogram aggregation using a stochastic approach. +// 'do_greedy' is set to 1 if a greedy approach needs to be performed +// afterwards, 0 otherwise. +static int PairComparison(const void* idx1, const void* idx2) { + // To be used with bsearch: <0 when *idx1<*idx2, >0 if >, 0 when ==. + return (*(int*) idx1 - *(int*) idx2); +} +static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo, + int* const num_used, int min_cluster_size, + int* const do_greedy) { + int j, iter; + uint32_t seed = 1; int tries_with_no_success = 0; - int image_histo_size = image_histo->size; - const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8; - const int outer_iters = image_histo_size * iter_mult; - const int num_pairs = image_histo_size / 2; + const int outer_iters = *num_used; const int num_tries_no_success = outer_iters / 2; - int idx2_max = image_histo_size - 1; - int do_brute_dorce = 0; VP8LHistogram** const histograms = image_histo->histograms; + // Priority queue of histogram pairs. Its size of 'kHistoQueueSize' + // impacts the quality of the compression and the speed: the smaller the + // faster but the worse for the compression. + HistoQueue histo_queue; + const int kHistoQueueSize = 9; + int ok = 0; + // mapping from an index in image_histo with no NULL histogram to the full + // blown image_histo. + int* mappings; + + if (*num_used < min_cluster_size) { + *do_greedy = 1; + return 1; + } + + mappings = (int*) WebPSafeMalloc(*num_used, sizeof(*mappings)); + if (mappings == NULL) return 0; + if (!HistoQueueInit(&histo_queue, kHistoQueueSize)) goto End; + // Fill the initial mapping. + for (j = 0, iter = 0; iter < image_histo->size; ++iter) { + if (histograms[iter] == NULL) continue; + mappings[j++] = iter; + } + assert(j == *num_used); // Collapse similar histograms in 'image_histo'. - ++min_cluster_size; for (iter = 0; - iter < outer_iters && image_histo_size >= min_cluster_size; + iter < outer_iters && *num_used >= min_cluster_size && + ++tries_with_no_success < num_tries_no_success; ++iter) { - double best_cost_diff = 0.; + int* mapping_index; + double best_cost = + (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff; int best_idx1 = -1, best_idx2 = 1; - int j; - int num_tries = - (num_pairs < image_histo_size) ? num_pairs : image_histo_size; - // Use a brute force approach if: - // - stochastic has not worked for a while and - // - if the number of iterations for brute force is less than the number of - // iterations if we never find a match ever again stochastically (hence - // num_tries times the number of remaining outer iterations). - do_brute_dorce = - (tries_with_no_success > 10) && - (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter)); - if (do_brute_dorce) num_tries = idx2_max; - - seed += iter; - for (j = 0; j < num_tries; ++j) { - double curr_cost_diff; - // Choose two histograms at random and try to combine them. - uint32_t idx1, idx2; - if (do_brute_dorce) { - // Use a brute force approach. - idx1 = (uint32_t)j; - idx2 = (uint32_t)idx2_max; - } else { - const uint32_t tmp = (j & 7) + 1; - const uint32_t diff = - (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1); - idx1 = MyRand(&seed) % image_histo_size; - idx2 = (idx1 + diff + 1) % image_histo_size; - if (idx1 == idx2) { - continue; - } - } - - // Calculate cost reduction on combining. - curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2], - tmp_histo, best_cost_diff); - if (curr_cost_diff < best_cost_diff) { // found a better pair? - HistogramSwap(&best_combo, &tmp_histo); - best_cost_diff = curr_cost_diff; - best_idx1 = idx1; - best_idx2 = idx2; + const uint32_t rand_range = (*num_used - 1) * (*num_used); + // (*num_used) / 2 was chosen empirically. Less means faster but worse + // compression. + const int num_tries = (*num_used) / 2; + + // Pick random samples. + for (j = 0; *num_used >= 2 && j < num_tries; ++j) { + double curr_cost; + // Choose two different histograms at random and try to combine them. + const uint32_t tmp = MyRand(&seed) % rand_range; + uint32_t idx1 = tmp / (*num_used - 1); + uint32_t idx2 = tmp % (*num_used - 1); + if (idx2 >= idx1) ++idx2; + idx1 = mappings[idx1]; + idx2 = mappings[idx2]; + + // Calculate cost reduction on combination. + curr_cost = + HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost); + if (curr_cost < 0) { // found a better pair? + best_cost = curr_cost; + // Empty the queue if we reached full capacity. + if (histo_queue.size == histo_queue.max_size) break; } } - if (do_brute_dorce) --idx2_max; - - if (best_idx1 >= 0) { - HistogramSwap(&best_combo, &histograms[best_idx1]); - // swap best_idx2 slot with last one (which is now unused) - --image_histo_size; - if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1; - if (best_idx2 != image_histo_size) { - HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]); - histograms[image_histo_size] = NULL; + if (histo_queue.size == 0) continue; + + // Get the best histograms. + best_idx1 = histo_queue.queue[0].idx1; + best_idx2 = histo_queue.queue[0].idx2; + assert(best_idx1 < best_idx2); + // Pop best_idx2 from mappings. + mapping_index = (int*) bsearch(&best_idx2, mappings, *num_used, + sizeof(best_idx2), &PairComparison); + assert(mapping_index != NULL); + memmove(mapping_index, mapping_index + 1, sizeof(*mapping_index) * + ((*num_used) - (mapping_index - mappings) - 1)); + // Merge the histograms and remove best_idx2 from the queue. + HistogramAdd(histograms[best_idx2], histograms[best_idx1], + histograms[best_idx1]); + histograms[best_idx1]->bit_cost_ = histo_queue.queue[0].cost_combo; + HistogramSetRemoveHistogram(image_histo, best_idx2, num_used); + // Parse the queue and update each pair that deals with best_idx1, + // best_idx2 or image_histo_size. + for (j = 0; j < histo_queue.size;) { + HistogramPair* const p = histo_queue.queue + j; + const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2; + const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2; + int do_eval = 0; + // The front pair could have been duplicated by a random pick so + // check for it all the time nevertheless. + if (is_idx1_best && is_idx2_best) { + HistoQueuePopPair(&histo_queue, p); + continue; } - tries_with_no_success = 0; - } - if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) { - break; + // Any pair containing one of the two best indices should only refer to + // best_idx1. Its cost should also be updated. + if (is_idx1_best) { + p->idx1 = best_idx1; + do_eval = 1; + } else if (is_idx2_best) { + p->idx2 = best_idx1; + do_eval = 1; + } + // Make sure the index order is respected. + if (p->idx1 > p->idx2) { + const int tmp = p->idx2; + p->idx2 = p->idx1; + p->idx1 = tmp; + } + if (do_eval) { + // Re-evaluate the cost of an updated pair. + HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p); + if (p->cost_diff >= 0.) { + HistoQueuePopPair(&histo_queue, p); + continue; + } + } + HistoQueueUpdateHead(&histo_queue, p); + ++j; } + tries_with_no_success = 0; } - image_histo->size = image_histo_size; + *do_greedy = (*num_used <= min_cluster_size); + ok = 1; + +End: + HistoQueueClear(&histo_queue); + WebPSafeFree(mappings); + return ok; } // ----------------------------------------------------------------------------- // Histogram refinement // Find the best 'out' histogram for each of the 'in' histograms. +// At call-time, 'out' contains the histograms of the clusters. // Note: we assume that out[]->bit_cost_ is already up-to-date. static void HistogramRemap(const VP8LHistogramSet* const in, - const VP8LHistogramSet* const out, + VP8LHistogramSet* const out, uint16_t* const symbols) { int i; VP8LHistogram** const in_histo = in->histograms; VP8LHistogram** const out_histo = out->histograms; - const int in_size = in->size; + const int in_size = out->max_size; const int out_size = out->size; if (out_size > 1) { for (i = 0; i < in_size; ++i) { int best_out = 0; double best_bits = MAX_COST; int k; + if (in_histo[i] == NULL) { + // Arbitrarily set to the previous value if unused to help future LZ77. + symbols[i] = symbols[i - 1]; + continue; + } for (k = 0; k < out_size; ++k) { - const double cur_bits = - HistogramAddThresh(out_histo[k], in_histo[i], best_bits); + double cur_bits; + cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits); if (k == 0 || cur_bits < best_bits) { best_bits = cur_bits; best_out = k; @@ -899,12 +1081,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in, } // Recompute each out based on raw and symbols. - for (i = 0; i < out_size; ++i) { - HistogramClear(out_histo[i]); - } + VP8LHistogramSetClear(out); + out->size = out_size; for (i = 0; i < in_size; ++i) { - const int idx = symbols[i]; + int idx; + if (in_histo[i] == NULL) continue; + idx = symbols[i]; HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]); } } @@ -920,12 +1103,76 @@ static double GetCombineCostFactor(int histo_size, int quality) { return combine_cost_factor; } +// Given a HistogramSet 'set', the mapping of clusters 'cluster_mapping' and the +// current assignment of the cells in 'symbols', merge the clusters and +// assign the smallest possible clusters values. +static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set, + uint16_t* const cluster_mappings, + int num_clusters, + uint16_t* const cluster_mappings_tmp, + uint16_t* const symbols) { + int i, cluster_max; + int do_continue = 1; + // First, assign the lowest cluster to each pixel. + while (do_continue) { + do_continue = 0; + for (i = 0; i < num_clusters; ++i) { + int k; + k = cluster_mappings[i]; + while (k != cluster_mappings[k]) { + cluster_mappings[k] = cluster_mappings[cluster_mappings[k]]; + k = cluster_mappings[k]; + } + if (k != cluster_mappings[i]) { + do_continue = 1; + cluster_mappings[i] = k; + } + } + } + // Create a mapping from a cluster id to its minimal version. + cluster_max = 0; + memset(cluster_mappings_tmp, 0, + set->max_size * sizeof(*cluster_mappings_tmp)); + assert(cluster_mappings[0] == 0); + // Re-map the ids. + for (i = 0; i < set->max_size; ++i) { + int cluster; + if (symbols[i] == kInvalidHistogramSymbol) continue; + cluster = cluster_mappings[symbols[i]]; + assert(symbols[i] < num_clusters); + if (cluster > 0 && cluster_mappings_tmp[cluster] == 0) { + ++cluster_max; + cluster_mappings_tmp[cluster] = cluster_max; + } + symbols[i] = cluster_mappings_tmp[cluster]; + } + + // Make sure all cluster values are used. + cluster_max = 0; + for (i = 0; i < set->max_size; ++i) { + if (symbols[i] == kInvalidHistogramSymbol) continue; + if (symbols[i] <= cluster_max) continue; + ++cluster_max; + assert(symbols[i] == cluster_max); + } +} + +static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) { + uint32_t size; + int i; + for (i = 0, size = 0; i < image_histo->size; ++i) { + if (image_histo->histograms[i] == NULL) continue; + image_histo->histograms[size++] = image_histo->histograms[i]; + } + image_histo->size = size; +} + int VP8LGetHistoImageSymbols(int xsize, int ysize, const VP8LBackwardRefs* const refs, int quality, int low_effort, int histo_bits, int cache_bits, VP8LHistogramSet* const image_histo, - VP8LHistogramSet* const tmp_histos, + VP8LHistogram* const tmp_histo, uint16_t* const histogram_symbols) { int ok = 0; const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1; @@ -933,35 +1180,41 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, const int image_histo_raw_size = histo_xsize * histo_ysize; VP8LHistogramSet* const orig_histo = VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits); - VP8LHistogram* cur_combo; // Don't attempt linear bin-partition heuristic for // histograms of small sizes (as bin_map will be very sparse) and // maximum quality q==100 (to preserve the compression gains at that level). const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE; - const int entropy_combine = - (orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100); - - if (orig_histo == NULL) goto Error; + int entropy_combine; + uint16_t* const map_tmp = + WebPSafeMalloc(2 * image_histo_raw_size, sizeof(map_tmp)); + uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size; + int num_used = image_histo_raw_size; + if (orig_histo == NULL || map_tmp == NULL) goto Error; // Construct the histograms from backward references. HistogramBuild(xsize, histo_bits, refs, orig_histo); // Copies the histograms and computes its bit_cost. - HistogramCopyAndAnalyze(orig_histo, image_histo); + // histogram_symbols is optimized + HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used, + histogram_symbols); + + entropy_combine = + (num_used > entropy_combine_num_bins * 2) && (quality < 100); - cur_combo = tmp_histos->histograms[1]; // pick up working slot if (entropy_combine) { - const int bin_map_size = orig_histo->size; - // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok. - uint16_t* const bin_map = histogram_symbols; + uint16_t* const bin_map = map_tmp; const double combine_cost_factor = GetCombineCostFactor(image_histo_raw_size, quality); + const uint32_t num_clusters = num_used; - HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort); + HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort); // Collapse histograms with similar entropy. - cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo, - bin_map, bin_map_size, - entropy_combine_num_bins, - combine_cost_factor, low_effort); + HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols, + cluster_mappings, tmp_histo, bin_map, + entropy_combine_num_bins, combine_cost_factor, + low_effort); + OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters, + map_tmp, histogram_symbols); } // Don't combine the histograms using stochastic and greedy heuristics for @@ -970,21 +1223,27 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, const float x = quality / 100.f; // cubic ramp between 1 and MAX_HISTO_GREEDY: const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1)); - HistogramCombineStochastic(image_histo, tmp_histos->histograms[0], - cur_combo, quality, threshold_size); - if ((image_histo->size <= threshold_size) && - !HistogramCombineGreedy(image_histo)) { + int do_greedy; + if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size, + &do_greedy)) { goto Error; } + if (do_greedy) { + RemoveEmptyHistograms(image_histo); + if (!HistogramCombineGreedy(image_histo, &num_used)) { + goto Error; + } + } } - // TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also. // Find the optimal map from original histograms to the final ones. + RemoveEmptyHistograms(image_histo); HistogramRemap(orig_histo, image_histo, histogram_symbols); ok = 1; Error: VP8LFreeHistogramSet(orig_histo); + WebPSafeFree(map_tmp); return ok; } diff --git a/src/3rdparty/libwebp/src/enc/histogram_enc.h b/src/3rdparty/libwebp/src/enc/histogram_enc.h index a9d258a..54c2d21 100644 --- a/src/3rdparty/libwebp/src/enc/histogram_enc.h +++ b/src/3rdparty/libwebp/src/enc/histogram_enc.h @@ -11,14 +11,14 @@ // // Models the histograms of literal and distance codes. -#ifndef WEBP_ENC_HISTOGRAM_H_ -#define WEBP_ENC_HISTOGRAM_H_ +#ifndef WEBP_ENC_HISTOGRAM_ENC_H_ +#define WEBP_ENC_HISTOGRAM_ENC_H_ #include <string.h> -#include "./backward_references_enc.h" -#include "../webp/format_constants.h" -#include "../webp/types.h" +#include "src/enc/backward_references_enc.h" +#include "src/webp/format_constants.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -44,6 +44,7 @@ typedef struct { double literal_cost_; // Cached values of dominant entropy costs: double red_cost_; // literal, red & blue. double blue_cost_; + uint8_t is_used_[5]; // 5 for literal, red, blue, alpha, distance } VP8LHistogram; // Collection of histograms with fixed capacity, allocated as one @@ -67,7 +68,9 @@ void VP8LHistogramCreate(VP8LHistogram* const p, int VP8LGetHistogramSize(int palette_code_bits); // Set the palette_code_bits and reset the stats. -void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits); +// If init_arrays is true, the arrays are also filled with 0's. +void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits, + int init_arrays); // Collect all the references into a histogram (without reset) void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs, @@ -83,6 +86,9 @@ void VP8LFreeHistogramSet(VP8LHistogramSet* const histo); // using 'cache_bits'. Return NULL in case of memory error. VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits); +// Set the histograms in set to 0. +void VP8LHistogramSetClear(VP8LHistogramSet* const set); + // Allocate and initialize histogram object with specified 'cache_bits'. // Returns NULL in case of memory error. // Special case of VP8LAllocateHistogramSet, with size equals 1. @@ -90,7 +96,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits); // Accumulate a token 'v' into a histogram. void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo, - const PixOrCopy* const v); + const PixOrCopy* const v, + int (*const distance_modifier)(int, int), + int distance_modifier_arg0); static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) { return NUM_LITERAL_CODES + NUM_LENGTH_CODES + @@ -103,21 +111,18 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize, int quality, int low_effort, int histogram_bits, int cache_bits, VP8LHistogramSet* const image_in, - VP8LHistogramSet* const tmp_histos, + VP8LHistogram* const tmp_histo, uint16_t* const histogram_symbols); // Returns the entropy for the symbols in the input array. -// Also sets trivial_symbol to the code value, if the array has only one code -// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM. -double VP8LBitsEntropy(const uint32_t* const array, int n, - uint32_t* const trivial_symbol); +double VP8LBitsEntropy(const uint32_t* const array, int n); // Estimate how many bits the combined entropy of literals and distance // approximately maps to. -double VP8LHistogramEstimateBits(const VP8LHistogram* const p); +double VP8LHistogramEstimateBits(VP8LHistogram* const p); #ifdef __cplusplus } #endif -#endif // WEBP_ENC_HISTOGRAM_H_ +#endif // WEBP_ENC_HISTOGRAM_ENC_H_ diff --git a/src/3rdparty/libwebp/src/enc/iterator_enc.c b/src/3rdparty/libwebp/src/enc/iterator_enc.c index e48d30b..29f91d8 100644 --- a/src/3rdparty/libwebp/src/enc/iterator_enc.c +++ b/src/3rdparty/libwebp/src/enc/iterator_enc.c @@ -13,7 +13,7 @@ #include <string.h> -#include "./vp8i_enc.h" +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // VP8Iterator @@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) { memset(it->u_left_, 129, 8); memset(it->v_left_, 129, 8); it->left_nz_[8] = 0; + if (it->top_derr_ != NULL) { + memset(&it->left_derr_, 0, sizeof(it->left_derr_)); + } } static void InitTop(VP8EncIterator* const it) { @@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) { const size_t top_size = enc->mb_w_ * 16; memset(enc->y_top_, 127, 2 * top_size); memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_)); + if (enc->top_derr_ != NULL) { + memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_)); + } } void VP8IteratorSetRow(VP8EncIterator* const it, int y) { @@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) { it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1); it->u_left_ = it->y_left_ + 16 + 16; it->v_left_ = it->u_left_ + 16; + it->top_derr_ = enc->top_derr_; VP8IteratorReset(it); } @@ -121,7 +128,7 @@ static void ImportLine(const uint8_t* src, int src_stride, for (; i < total_len; ++i) dst[i] = dst[len - 1]; } -void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) { +void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32) { const VP8Encoder* const enc = it->enc_; const int x = it->x_, y = it->y_; const WebPPicture* const pic = enc->pic_; @@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it, } //------------------------------------------------------------------------------ - diff --git a/src/3rdparty/libwebp/src/enc/near_lossless_enc.c b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c index 2bd03ab..5517a7e 100644 --- a/src/3rdparty/libwebp/src/enc/near_lossless_enc.c +++ b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c @@ -17,18 +17,20 @@ #include <assert.h> #include <stdlib.h> -#include "../dsp/lossless_common.h" -#include "../utils/utils.h" -#include "./vp8i_enc.h" +#include "src/dsp/lossless_common.h" +#include "src/utils/utils.h" +#include "src/enc/vp8li_enc.h" + +#if (WEBP_NEAR_LOSSLESS == 1) #define MIN_DIM_FOR_NEAR_LOSSLESS 64 #define MAX_LIMIT_BITS 5 // Quantizes the value up or down to a multiple of 1<<bits (or to 255), // choosing the closer one, resolving ties using bankers' rounding. -static int FindClosestDiscretized(int a, int bits) { - const int mask = (1 << bits) - 1; - const int biased = a + (mask >> 1) + ((a >> bits) & 1); +static uint32_t FindClosestDiscretized(uint32_t a, int bits) { + const uint32_t mask = (1u << bits) - 1; + const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1); assert(bits > 0); if (biased > 0xff) return 0xff; return biased & ~mask; @@ -69,22 +71,30 @@ static int IsSmooth(const uint32_t* const prev_row, } // Adjusts pixel values of image with given maximum error. -static void NearLossless(int xsize, int ysize, uint32_t* argb, - int limit_bits, uint32_t* copy_buffer) { +static void NearLossless(int xsize, int ysize, const uint32_t* argb_src, + int stride, int limit_bits, uint32_t* copy_buffer, + uint32_t* argb_dst) { int x, y; const int limit = 1 << limit_bits; uint32_t* prev_row = copy_buffer; uint32_t* curr_row = prev_row + xsize; uint32_t* next_row = curr_row + xsize; - memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0])); + memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0])); + memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0])); - for (y = 1; y < ysize - 1; ++y) { - uint32_t* const curr_argb_row = argb + y * xsize; - uint32_t* const next_argb_row = curr_argb_row + xsize; - memcpy(next_row, next_argb_row, xsize * sizeof(argb[0])); - for (x = 1; x < xsize - 1; ++x) { - if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) { - curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits); + for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) { + if (y == 0 || y == ysize - 1) { + memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0])); + } else { + memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0])); + argb_dst[0] = argb_src[0]; + argb_dst[xsize - 1] = argb_src[xsize - 1]; + for (x = 1; x < xsize - 1; ++x) { + if (IsSmooth(prev_row, curr_row, next_row, x, limit)) { + argb_dst[x] = curr_row[x]; + } else { + argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits); + } } } { @@ -97,26 +107,45 @@ static void NearLossless(int xsize, int ysize, uint32_t* argb, } } -int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) { +int VP8ApplyNearLossless(const WebPPicture* const picture, int quality, + uint32_t* const argb_dst) { int i; + const int xsize = picture->width; + const int ysize = picture->height; + const int stride = picture->argb_stride; uint32_t* const copy_buffer = (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer)); const int limit_bits = VP8LNearLosslessBits(quality); - assert(argb != NULL); - assert(limit_bits >= 0); + assert(argb_dst != NULL); + assert(limit_bits > 0); assert(limit_bits <= MAX_LIMIT_BITS); if (copy_buffer == NULL) { return 0; } // For small icon images, don't attempt to apply near-lossless compression. - if (xsize < MIN_DIM_FOR_NEAR_LOSSLESS && ysize < MIN_DIM_FOR_NEAR_LOSSLESS) { + if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS && + ysize < MIN_DIM_FOR_NEAR_LOSSLESS) || + ysize < 3) { + for (i = 0; i < ysize; ++i) { + memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride, + xsize * sizeof(*argb_dst)); + } WebPSafeFree(copy_buffer); return 1; } - for (i = limit_bits; i != 0; --i) { - NearLossless(xsize, ysize, argb, i, copy_buffer); + NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer, + argb_dst); + for (i = limit_bits - 1; i != 0; --i) { + NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst); } WebPSafeFree(copy_buffer); return 1; } +#else // (WEBP_NEAR_LOSSLESS == 1) + +// Define a stub to suppress compiler warnings. +extern void VP8LNearLosslessStub(void); +void VP8LNearLosslessStub(void) {} + +#endif // (WEBP_NEAR_LOSSLESS == 1) diff --git a/src/3rdparty/libwebp/src/enc/picture_csp_enc.c b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c index e5d1c75..02d9df7 100644 --- a/src/3rdparty/libwebp/src/enc/picture_csp_enc.c +++ b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c @@ -15,10 +15,12 @@ #include <stdlib.h> #include <math.h> -#include "./vp8i_enc.h" -#include "../utils/random_utils.h" -#include "../utils/utils.h" -#include "../dsp/yuv.h" +#include "src/enc/vp8i_enc.h" +#include "src/utils/random_utils.h" +#include "src/utils/utils.h" +#include "src/dsp/dsp.h" +#include "src/dsp/lossless.h" +#include "src/dsp/yuv.h" // Uncomment to disable gamma-compression during RGB->U/V averaging #define USE_GAMMA_COMPRESSION @@ -26,11 +28,11 @@ // If defined, use table to compute x / alpha. #define USE_INVERSE_ALPHA_TABLE -static const union { - uint32_t argb; - uint8_t bytes[4]; -} test_endian = { 0xff000000u }; -#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff) +#ifdef WORDS_BIGENDIAN +#define ALPHA_OFFSET 0 // uint32_t 0xff000000 is 0xff,00,00,00 in memory +#else +#define ALPHA_OFFSET 3 // uint32_t 0xff000000 is 0x00,00,00,ff in memory +#endif //------------------------------------------------------------------------------ // Detection of non-trivial transparency @@ -39,12 +41,15 @@ static const union { static int CheckNonOpaque(const uint8_t* alpha, int width, int height, int x_step, int y_step) { if (alpha == NULL) return 0; - while (height-- > 0) { - int x; - for (x = 0; x < width * x_step; x += x_step) { - if (alpha[x] != 0xff) return 1; // TODO(skal): check 4/8 bytes at a time. + WebPInitAlphaProcessing(); + if (x_step == 1) { + for (; height-- > 0; alpha += y_step) { + if (WebPHasAlpha8b(alpha, width)) return 1; + } + } else { + for (; height-- > 0; alpha += y_step) { + if (WebPHasAlpha32b(alpha, width)) return 1; } - alpha += y_step; } return 0; } @@ -56,15 +61,10 @@ int WebPPictureHasTransparency(const WebPPicture* picture) { return CheckNonOpaque(picture->a, picture->width, picture->height, 1, picture->a_stride); } else { - int x, y; - const uint32_t* argb = picture->argb; - if (argb == NULL) return 0; - for (y = 0; y < picture->height; ++y) { - for (x = 0; x < picture->width; ++x) { - if (argb[x] < 0xff000000u) return 1; // test any alpha values != 0xff - } - argb += picture->argb_stride; - } + const int alpha_offset = ALPHA_OFFSET; + return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset, + picture->width, picture->height, + 4, picture->argb_stride * sizeof(*picture->argb)); } return 0; } @@ -126,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) { #else -static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {} +static void InitGammaTables(void) {} static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; } static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) { return (int)(base_value << shift); @@ -170,29 +170,33 @@ typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W #if defined(USE_GAMMA_COMPRESSION) -// float variant of gamma-correction -// We use tables of different size and precision for the Rec709 +// We use tables of different size and precision for the Rec709 / BT2020 // transfer function. #define kGammaF (1./0.45) -static float kGammaToLinearTabF[MAX_Y_T + 1]; // size scales with Y_FIX -static float kLinearToGammaTabF[kGammaTabSize + 2]; -static volatile int kGammaTablesFOk = 0; - -static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) { - if (!kGammaTablesFOk) { +static uint32_t kLinearToGammaTabS[kGammaTabSize + 2]; +#define GAMMA_TO_LINEAR_BITS 14 +static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX +static volatile int kGammaTablesSOk = 0; + +static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) { + assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values + if (!kGammaTablesSOk) { int v; const double norm = 1. / MAX_Y_T; const double scale = 1. / kGammaTabSize; - const double a = 0.099; - const double thresh = 0.018; + const double a = 0.09929682680944; + const double thresh = 0.018053968510807; + const double final_scale = 1 << GAMMA_TO_LINEAR_BITS; for (v = 0; v <= MAX_Y_T; ++v) { const double g = norm * v; + double value; if (g <= thresh * 4.5) { - kGammaToLinearTabF[v] = (float)(g / 4.5); + value = g / 4.5; } else { const double a_rec = 1. / (1. + a); - kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF); + value = pow(a_rec * (g + a), kGammaF); } + kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5); } for (v = 0; v <= kGammaTabSize; ++v) { const double g = scale * v; @@ -202,37 +206,44 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) { } else { value = (1. + a) * pow(g, 1. / kGammaF) - a; } - kLinearToGammaTabF[v] = (float)(MAX_Y_T * value); + // we already incorporate the 1/2 rounding constant here + kLinearToGammaTabS[v] = + (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1); } // to prevent small rounding errors to cause read-overflow: - kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize]; - kGammaTablesFOk = 1; + kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize]; + kGammaTablesSOk = 1; } } -static WEBP_INLINE float GammaToLinearF(int v) { - return kGammaToLinearTabF[v]; +// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS +static WEBP_INLINE uint32_t GammaToLinearS(int v) { + return kGammaToLinearTabS[v]; } -static WEBP_INLINE int LinearToGammaF(float value) { - const float v = value * kGammaTabSize; - const int tab_pos = (int)v; - const float x = v - (float)tab_pos; // fractional part - const float v0 = kLinearToGammaTabF[tab_pos + 0]; - const float v1 = kLinearToGammaTabF[tab_pos + 1]; - const float y = v1 * x + v0 * (1.f - x); // interpolate - return (int)(y + .5); +static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) { + // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision + const uint32_t v = value * kGammaTabSize; + const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS; + // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision + const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part + // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1]) + const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0]; + const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1]; + // Final interpolation. Note that rounding is already included. + const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0. + const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS); + return result; } #else -static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {} -static WEBP_INLINE float GammaToLinearF(int v) { - const float norm = 1.f / MAX_Y_T; - return norm * v; +static void InitGammaTablesS(void) {} +static WEBP_INLINE uint32_t GammaToLinearS(int v) { + return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T; } -static WEBP_INLINE int LinearToGammaF(float value) { - return (int)(MAX_Y_T * value + .5); +static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) { + return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS; } #endif // USE_GAMMA_COMPRESSION @@ -254,26 +265,22 @@ static int RGBToGray(int r, int g, int b) { return (luma >> YUV_FIX); } -static float RGBToGrayF(float r, float g, float b) { - return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b); -} - -static int ScaleDown(int a, int b, int c, int d) { - const float A = GammaToLinearF(a); - const float B = GammaToLinearF(b); - const float C = GammaToLinearF(c); - const float D = GammaToLinearF(d); - return LinearToGammaF(0.25f * (A + B + C + D)); +static uint32_t ScaleDown(int a, int b, int c, int d) { + const uint32_t A = GammaToLinearS(a); + const uint32_t B = GammaToLinearS(b); + const uint32_t C = GammaToLinearS(c); + const uint32_t D = GammaToLinearS(d); + return LinearToGammaS((A + B + C + D + 2) >> 2); } static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) { int i; for (i = 0; i < w; ++i) { - const float R = GammaToLinearF(src[0 * w + i]); - const float G = GammaToLinearF(src[1 * w + i]); - const float B = GammaToLinearF(src[2 * w + i]); - const float Y = RGBToGrayF(R, G, B); - dst[i] = (fixed_y_t)LinearToGammaF(Y); + const uint32_t R = GammaToLinearS(src[0 * w + i]); + const uint32_t G = GammaToLinearS(src[1 * w + i]); + const uint32_t B = GammaToLinearS(src[2 * w + i]); + const uint32_t Y = RGBToGray(R, G, B); + dst[i] = (fixed_y_t)LinearToGammaS(Y); } } @@ -856,7 +863,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, return 0; } if (has_alpha) { - WebPInitAlphaProcessing(); assert(step == 4); #if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE) assert(kAlphaFix + kGammaFix <= 31); @@ -864,7 +870,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, } if (use_iterative_conversion) { - InitGammaTablesF(); + InitGammaTablesS(); if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) { return 0; } @@ -991,10 +997,10 @@ static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace, return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION); } else { const uint8_t* const argb = (const uint8_t*)picture->argb; - const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1; - const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2; - const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3; - const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0; + const uint8_t* const a = argb + (0 ^ ALPHA_OFFSET); + const uint8_t* const r = argb + (1 ^ ALPHA_OFFSET); + const uint8_t* const g = argb + (2 ^ ALPHA_OFFSET); + const uint8_t* const b = argb + (3 ^ ALPHA_OFFSET); picture->colorspace = WEBP_YUV420; return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, @@ -1045,7 +1051,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) { const int argb_stride = 4 * picture->argb_stride; uint8_t* dst = (uint8_t*)picture->argb; const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y; - WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST); + WebPUpsampleLinePairFunc upsample = + WebPGetLinePairConverter(ALPHA_OFFSET > 0); // First row, with replicated top samples. upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width); @@ -1085,40 +1092,59 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) { // automatic import / conversion static int Import(WebPPicture* const picture, - const uint8_t* const rgb, int rgb_stride, + const uint8_t* rgb, int rgb_stride, int step, int swap_rb, int import_alpha) { int y; + // swap_rb -> b,g,r,a , !swap_rb -> r,g,b,a const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0); const uint8_t* g_ptr = rgb + 1; const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2); - const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL; const int width = picture->width; const int height = picture->height; if (!picture->use_argb) { + const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL; return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride, 0.f /* no dithering */, 0, picture); } if (!WebPPictureAlloc(picture)) return 0; - VP8EncDspARGBInit(); + VP8LDspInit(); + WebPInitAlphaProcessing(); if (import_alpha) { + // dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little endian uint32_t* dst = picture->argb; + const int do_copy = (ALPHA_OFFSET == 3) && swap_rb; assert(step == 4); - for (y = 0; y < height; ++y) { - VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst); - a_ptr += rgb_stride; - r_ptr += rgb_stride; - g_ptr += rgb_stride; - b_ptr += rgb_stride; - dst += picture->argb_stride; + if (do_copy) { + for (y = 0; y < height; ++y) { + memcpy(dst, rgb, width * 4); + rgb += rgb_stride; + dst += picture->argb_stride; + } + } else { + for (y = 0; y < height; ++y) { +#ifdef WORDS_BIGENDIAN + // BGRA or RGBA input order. + const uint8_t* a_ptr = rgb + 3; + WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst); + r_ptr += rgb_stride; + g_ptr += rgb_stride; + b_ptr += rgb_stride; +#else + // RGBA input order. Need to swap R and B. + VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst); +#endif + rgb += rgb_stride; + dst += picture->argb_stride; + } } } else { uint32_t* dst = picture->argb; assert(step >= 3); for (y = 0; y < height; ++y) { - VP8PackRGB(r_ptr, g_ptr, b_ptr, width, step, dst); + WebPPackRGB(r_ptr, g_ptr, b_ptr, width, step, dst); r_ptr += rgb_stride; g_ptr += rgb_stride; b_ptr += rgb_stride; @@ -1130,12 +1156,7 @@ static int Import(WebPPicture* const picture, // Public API -int WebPPictureImportRGB(WebPPicture* picture, - const uint8_t* rgb, int rgb_stride) { - return (picture != NULL && rgb != NULL) - ? Import(picture, rgb, rgb_stride, 3, 0, 0) - : 0; -} +#if !defined(WEBP_REDUCE_CSP) int WebPPictureImportBGR(WebPPicture* picture, const uint8_t* rgb, int rgb_stride) { @@ -1144,31 +1165,41 @@ int WebPPictureImportBGR(WebPPicture* picture, : 0; } -int WebPPictureImportRGBA(WebPPicture* picture, +int WebPPictureImportBGRA(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { return (picture != NULL && rgba != NULL) - ? Import(picture, rgba, rgba_stride, 4, 0, 1) + ? Import(picture, rgba, rgba_stride, 4, 1, 1) : 0; } -int WebPPictureImportBGRA(WebPPicture* picture, + +int WebPPictureImportBGRX(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { return (picture != NULL && rgba != NULL) - ? Import(picture, rgba, rgba_stride, 4, 1, 1) + ? Import(picture, rgba, rgba_stride, 4, 1, 0) : 0; } -int WebPPictureImportRGBX(WebPPicture* picture, +#endif // WEBP_REDUCE_CSP + +int WebPPictureImportRGB(WebPPicture* picture, + const uint8_t* rgb, int rgb_stride) { + return (picture != NULL && rgb != NULL) + ? Import(picture, rgb, rgb_stride, 3, 0, 0) + : 0; +} + +int WebPPictureImportRGBA(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { return (picture != NULL && rgba != NULL) - ? Import(picture, rgba, rgba_stride, 4, 0, 0) + ? Import(picture, rgba, rgba_stride, 4, 0, 1) : 0; } -int WebPPictureImportBGRX(WebPPicture* picture, +int WebPPictureImportRGBX(WebPPicture* picture, const uint8_t* rgba, int rgba_stride) { return (picture != NULL && rgba != NULL) - ? Import(picture, rgba, rgba_stride, 4, 1, 0) + ? Import(picture, rgba, rgba_stride, 4, 0, 0) : 0; } diff --git a/src/3rdparty/libwebp/src/enc/picture_enc.c b/src/3rdparty/libwebp/src/enc/picture_enc.c index dfa6651..c691622 100644 --- a/src/3rdparty/libwebp/src/enc/picture_enc.c +++ b/src/3rdparty/libwebp/src/enc/picture_enc.c @@ -14,9 +14,9 @@ #include <assert.h> #include <stdlib.h> -#include "./vp8i_enc.h" -#include "../dsp/dsp.h" -#include "../utils/utils.h" +#include "src/enc/vp8i_enc.h" +#include "src/dsp/dsp.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // WebPPicture @@ -76,13 +76,12 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) { return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION); } // allocate a new buffer. - memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb)); + memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb)); if (memory == NULL) { return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY); } - // TODO(skal): align plane to cache line? picture->memory_argb_ = memory; - picture->argb = (uint32_t*)memory; + picture->argb = (uint32_t*)WEBP_ALIGN(memory); picture->argb_stride = width; return 1; } @@ -92,8 +91,8 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) { (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK); const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT; const int y_stride = width; - const int uv_width = (width + 1) >> 1; - const int uv_height = (height + 1) >> 1; + const int uv_width = (int)(((int64_t)width + 1) >> 1); + const int uv_height = (int)(((int64_t)height + 1) >> 1); const int uv_stride = uv_width; int a_width, a_stride; uint64_t y_size, uv_size, a_size, total_size; @@ -118,8 +117,8 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) { total_size = y_size + a_size + 2 * uv_size; // Security and validation checks - if (width <= 0 || height <= 0 || // luma/alpha param error - uv_width < 0 || uv_height < 0) { // u/v param error + if (width <= 0 || height <= 0 || // luma/alpha param error + uv_width <= 0 || uv_height <= 0) { // u/v param error return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION); } // allocate a new buffer. @@ -271,9 +270,11 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q, \ } ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB) -ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR) ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA) +#if !defined(WEBP_REDUCE_CSP) +ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR) ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA) +#endif // WEBP_REDUCE_CSP #undef ENCODE_FUNC @@ -284,9 +285,11 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) { \ } LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB) -LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR) LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA) +#if !defined(WEBP_REDUCE_CSP) +LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR) LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA) +#endif // WEBP_REDUCE_CSP #undef LOSSLESS_ENCODE_FUNC diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c index 9c0b229..1a2f0be 100644 --- a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c +++ b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c @@ -11,11 +11,16 @@ // // Author: Skal (pascal.massimino@gmail.com) +#include "src/webp/encode.h" + +#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE)) + #include <math.h> #include <stdlib.h> -#include "./vp8i_enc.h" -#include "../utils/utils.h" +#include "src/dsp/dsp.h" +#include "src/enc/vp8i_enc.h" +#include "src/utils/utils.h" typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride, const uint8_t* ref, int ref_stride, @@ -165,6 +170,12 @@ int WebPPlaneDistortion(const uint8_t* src, size_t src_stride, return 1; } +#ifdef WORDS_BIGENDIAN +#define BLUE_OFFSET 3 // uint32_t 0x000000ff is 0x00,00,00,ff in memory +#else +#define BLUE_OFFSET 0 // uint32_t 0x000000ff is 0xff,00,00,00 in memory +#endif + int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, int type, float results[5]) { int w, h, c; @@ -191,8 +202,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, float distortion; const size_t stride0 = 4 * (size_t)p0.argb_stride; const size_t stride1 = 4 * (size_t)p1.argb_stride; - if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0, - (const uint8_t*)p1.argb + c, stride1, + // results are reported as BGRA + const int offset = c ^ BLUE_OFFSET; + if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0, + (const uint8_t*)p1.argb + offset, stride1, w, h, 4, type, &distortion, results + c)) { goto Error; } @@ -210,4 +223,36 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, return ok; } -//------------------------------------------------------------------------------ +#undef BLUE_OFFSET + +#else // defined(WEBP_DISABLE_STATS) +int WebPPlaneDistortion(const uint8_t* src, size_t src_stride, + const uint8_t* ref, size_t ref_stride, + int width, int height, size_t x_step, + int type, float* distortion, float* result) { + (void)src; + (void)src_stride; + (void)ref; + (void)ref_stride; + (void)width; + (void)height; + (void)x_step; + (void)type; + if (distortion == NULL || result == NULL) return 0; + *distortion = 0.f; + *result = 0.f; + return 1; +} + +int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref, + int type, float results[5]) { + int i; + (void)src; + (void)ref; + (void)type; + if (results == NULL) return 0; + for (i = 0; i < 5; ++i) results[i] = 0.f; + return 1; +} + +#endif // !defined(WEBP_DISABLE_STATS) diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c index 0b7181c..58a6ae7 100644 --- a/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c +++ b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c @@ -11,12 +11,16 @@ // // Author: Skal (pascal.massimino@gmail.com) +#include "src/webp/encode.h" + +#if !defined(WEBP_REDUCE_SIZE) + #include <assert.h> #include <stdlib.h> -#include "./vp8i_enc.h" -#include "../utils/rescaler_utils.h" -#include "../utils/utils.h" +#include "src/enc/vp8i_enc.h" +#include "src/utils/rescaler_utils.h" +#include "src/utils/utils.h" #define HALVE(x) (((x) + 1) >> 1) @@ -261,4 +265,45 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) { return 1; } -//------------------------------------------------------------------------------ +#else // defined(WEBP_REDUCE_SIZE) + +int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) { + (void)src; + (void)dst; + return 0; +} + +int WebPPictureIsView(const WebPPicture* picture) { + (void)picture; + return 0; +} + +int WebPPictureView(const WebPPicture* src, + int left, int top, int width, int height, + WebPPicture* dst) { + (void)src; + (void)left; + (void)top; + (void)width; + (void)height; + (void)dst; + return 0; +} + +int WebPPictureCrop(WebPPicture* pic, + int left, int top, int width, int height) { + (void)pic; + (void)left; + (void)top; + (void)width; + (void)height; + return 0; +} + +int WebPPictureRescale(WebPPicture* pic, int width, int height) { + (void)pic; + (void)width; + (void)height; + return 0; +} +#endif // !defined(WEBP_REDUCE_SIZE) diff --git a/src/3rdparty/libwebp/src/enc/picture_tools_enc.c b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c index 895df51..d0e8a49 100644 --- a/src/3rdparty/libwebp/src/enc/picture_tools_enc.c +++ b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c @@ -13,32 +13,15 @@ #include <assert.h> -#include "./vp8i_enc.h" -#include "../dsp/yuv.h" - -static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) { - return (0xff000000u | (r << 16) | (g << 8) | b); -} +#include "src/enc/vp8i_enc.h" +#include "src/dsp/yuv.h" //------------------------------------------------------------------------------ // Helper: clean up fully transparent area to help compressibility. #define SIZE 8 #define SIZE2 (SIZE / 2) -static int is_transparent_area(const uint8_t* ptr, int stride, int size) { - int y, x; - for (y = 0; y < size; ++y) { - for (x = 0; x < size; ++x) { - if (ptr[x]) { - return 0; - } - } - ptr += stride; - } - return 1; -} - -static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) { +static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) { int y, x; for (y = 0; y < size; ++y) { for (x = 0; x < size; ++x) { @@ -51,7 +34,7 @@ static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) { return 1; } -static void flatten(uint8_t* ptr, int v, int stride, int size) { +static void Flatten(uint8_t* ptr, int v, int stride, int size) { int y; for (y = 0; y < size; ++y) { memset(ptr, v, size); @@ -59,7 +42,7 @@ static void flatten(uint8_t* ptr, int v, int stride, int size) { } } -static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) { +static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) { int x, y; for (y = 0; y < size; ++y) { for (x = 0; x < size; ++x) ptr[x] = v; @@ -67,54 +50,114 @@ static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) { } } +// Smoothen the luma components of transparent pixels. Return true if the whole +// block is transparent. +static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr, + int y_stride, int width, int height) { + int sum = 0, count = 0; + int x, y; + const uint8_t* alpha_ptr = a_ptr; + uint8_t* luma_ptr = y_ptr; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + if (alpha_ptr[x] != 0) { + ++count; + sum += luma_ptr[x]; + } + } + alpha_ptr += a_stride; + luma_ptr += y_stride; + } + if (count > 0 && count < width * height) { + const uint8_t avg_u8 = (uint8_t)(sum / count); + alpha_ptr = a_ptr; + luma_ptr = y_ptr; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8; + } + alpha_ptr += a_stride; + luma_ptr += y_stride; + } + } + return (count == 0); +} + void WebPCleanupTransparentArea(WebPPicture* pic) { int x, y, w, h; if (pic == NULL) return; w = pic->width / SIZE; h = pic->height / SIZE; - // note: we ignore the left-overs on right/bottom + // note: we ignore the left-overs on right/bottom, except for SmoothenBlock(). if (pic->use_argb) { uint32_t argb_value = 0; for (y = 0; y < h; ++y) { int need_reset = 1; for (x = 0; x < w; ++x) { const int off = (y * pic->argb_stride + x) * SIZE; - if (is_transparent_argb_area(pic->argb + off, pic->argb_stride, SIZE)) { + if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) { if (need_reset) { argb_value = pic->argb[off]; need_reset = 0; } - flatten_argb(pic->argb + off, argb_value, pic->argb_stride, SIZE); + FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE); } else { need_reset = 1; } } } } else { - const uint8_t* const a_ptr = pic->a; + const int width = pic->width; + const int height = pic->height; + const int y_stride = pic->y_stride; + const int uv_stride = pic->uv_stride; + const int a_stride = pic->a_stride; + uint8_t* y_ptr = pic->y; + uint8_t* u_ptr = pic->u; + uint8_t* v_ptr = pic->v; + const uint8_t* a_ptr = pic->a; int values[3] = { 0 }; - if (a_ptr == NULL) return; // nothing to do - for (y = 0; y < h; ++y) { + if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) { + return; + } + for (y = 0; y + SIZE <= height; y += SIZE) { int need_reset = 1; - for (x = 0; x < w; ++x) { - const int off_a = (y * pic->a_stride + x) * SIZE; - const int off_y = (y * pic->y_stride + x) * SIZE; - const int off_uv = (y * pic->uv_stride + x) * SIZE2; - if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) { + for (x = 0; x + SIZE <= width; x += SIZE) { + if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride, + SIZE, SIZE)) { if (need_reset) { - values[0] = pic->y[off_y]; - values[1] = pic->u[off_uv]; - values[2] = pic->v[off_uv]; + values[0] = y_ptr[x]; + values[1] = u_ptr[x >> 1]; + values[2] = v_ptr[x >> 1]; need_reset = 0; } - flatten(pic->y + off_y, values[0], pic->y_stride, SIZE); - flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2); - flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2); + Flatten(y_ptr + x, values[0], y_stride, SIZE); + Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2); + Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2); } else { need_reset = 1; } } + if (x < width) { + SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride, + width - x, SIZE); + } + a_ptr += SIZE * a_stride; + y_ptr += SIZE * y_stride; + u_ptr += SIZE2 * uv_stride; + v_ptr += SIZE2 * uv_stride; + } + if (y < height) { + const int sub_height = height - y; + for (x = 0; x + SIZE <= width; x += SIZE) { + SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride, + SIZE, sub_height); + } + if (x < width) { + SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride, + width - x, sub_height); + } } } } @@ -144,9 +187,13 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) { // Blend color and remove transparency info #define BLEND(V0, V1, ALPHA) \ - ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16) + ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16) #define BLEND_10BIT(V0, V1, ALPHA) \ - ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18) + ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18) + +static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) { + return (0xff000000u | (r << 16) | (g << 8) | b); +} void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) { const int red = (background_rgb >> 16) & 0xff; @@ -161,39 +208,44 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) { const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF); const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF); const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT; - if (!has_alpha || pic->a == NULL) return; // nothing to do + uint8_t* y_ptr = pic->y; + uint8_t* u_ptr = pic->u; + uint8_t* v_ptr = pic->v; + uint8_t* a_ptr = pic->a; + if (!has_alpha || a_ptr == NULL) return; // nothing to do for (y = 0; y < pic->height; ++y) { // Luma blending - uint8_t* const y_ptr = pic->y + y * pic->y_stride; - uint8_t* const a_ptr = pic->a + y * pic->a_stride; for (x = 0; x < pic->width; ++x) { - const int alpha = a_ptr[x]; + const uint8_t alpha = a_ptr[x]; if (alpha < 0xff) { - y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]); + y_ptr[x] = BLEND(Y0, y_ptr[x], alpha); } } // Chroma blending every even line if ((y & 1) == 0) { - uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride; - uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride; uint8_t* const a_ptr2 = (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride; for (x = 0; x < uv_width; ++x) { // Average four alpha values into a single blending weight. // TODO(skal): might lead to visible contouring. Can we do better? - const int alpha = + const uint32_t alpha = a_ptr[2 * x + 0] + a_ptr[2 * x + 1] + a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1]; - u[x] = BLEND_10BIT(U0, u[x], alpha); - v[x] = BLEND_10BIT(V0, v[x], alpha); + u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha); + v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha); } if (pic->width & 1) { // rightmost pixel - const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]); - u[x] = BLEND_10BIT(U0, u[x], alpha); - v[x] = BLEND_10BIT(V0, v[x], alpha); + const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]); + u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha); + v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha); } + } else { + u_ptr += pic->uv_stride; + v_ptr += pic->uv_stride; } - memset(a_ptr, 0xff, pic->width); + memset(a_ptr, 0xff, pic->width); // reset alpha value to opaque + a_ptr += pic->a_stride; + y_ptr += pic->y_stride; } } else { uint32_t* argb = pic->argb; diff --git a/src/3rdparty/libwebp/src/enc/predictor_enc.c b/src/3rdparty/libwebp/src/enc/predictor_enc.c index 0639b74..2e6762e 100644 --- a/src/3rdparty/libwebp/src/enc/predictor_enc.c +++ b/src/3rdparty/libwebp/src/enc/predictor_enc.c @@ -14,9 +14,9 @@ // Urvang Joshi (urvang@google.com) // Vincent Rabaud (vrabaud@google.com) -#include "../dsp/lossless.h" -#include "../dsp/lossless_common.h" -#include "./vp8li_enc.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" +#include "src/enc/vp8li_enc.h" #define MAX_DIFF_COST (1e30f) @@ -26,7 +26,6 @@ static const uint32_t kMaskAlpha = 0xff000000; // Mostly used to reduce code size + readability static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; } -static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; } //------------------------------------------------------------------------------ // Methods to calculate Entropy (Shannon). @@ -90,6 +89,9 @@ static WEBP_INLINE void PredictBatch(int mode, int x_start, int y, } } +#if (WEBP_NEAR_LOSSLESS == 1) +static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; } + static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) { const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24)); const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff)); @@ -175,6 +177,10 @@ static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict, } } +static WEBP_INLINE uint8_t NearLosslessDiff(uint8_t a, uint8_t b) { + return (uint8_t)((((int)(a) - (int)(b))) & 0xff); +} + // Quantize every component of the difference between the actual pixel value and // its prediction to a multiple of a quantization (a power of 2, not larger than // max_quantization which is a power of 2, smaller than max_diff). Take care if @@ -196,7 +202,7 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict, } if ((value >> 24) == 0 || (value >> 24) == 0xff) { // Preserve transparency of fully transparent or fully opaque pixels. - a = ((value >> 24) - (predict >> 24)) & 0xff; + a = NearLosslessDiff((value >> 24) & 0xff, (predict >> 24) & 0xff); } else { a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization); } @@ -209,15 +215,16 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict, // The amount by which green has been adjusted during quantization. It is // subtracted from red and blue for compensation, to avoid accumulating two // quantization errors in them. - green_diff = (new_green - (value >> 8)) & 0xff; + green_diff = NearLosslessDiff(new_green, (value >> 8) & 0xff); } - r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff, + r = NearLosslessComponent(NearLosslessDiff((value >> 16) & 0xff, green_diff), (predict >> 16) & 0xff, 0xff - new_green, quantization); - b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff, - 0xff - new_green, quantization); + b = NearLosslessComponent(NearLosslessDiff(value & 0xff, green_diff), + predict & 0xff, 0xff - new_green, quantization); return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b; } +#endif // (WEBP_NEAR_LOSSLESS == 1) // Stores the difference between the pixel and its prediction in "out". // In case of a lossy encoding, updates the source image to avoid propagating @@ -244,6 +251,7 @@ static WEBP_INLINE void GetResidual( } else { predict = pred_func(current_row[x - 1], upper_row + x); } +#if (WEBP_NEAR_LOSSLESS == 1) if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 || x == 0 || x == width - 1) { residual = VP8LSubPixels(current_row[x], predict); @@ -254,6 +262,13 @@ static WEBP_INLINE void GetResidual( current_row[x] = VP8LAddPixels(predict, residual); // x is never 0 here so we do not need to update upper_row like below. } +#else + (void)max_diffs; + (void)height; + (void)max_quantization; + (void)used_subtract_green; + residual = VP8LSubPixels(current_row[x], predict); +#endif if ((current_row[x] & kMaskAlpha) == 0) { // If alpha is 0, cleanup RGB. We can choose the RGB values of the // residual for best compression. The prediction of alpha itself can be @@ -296,11 +311,12 @@ static int GetBestPredictorForTile(int width, int height, const int max_x = GetMin(tile_size, width - start_x); // Whether there exist columns just outside the tile. const int have_left = (start_x > 0); - const int have_right = (max_x < width - start_x); // Position and size of the strip covering the tile and adjacent columns if // they exist. const int context_start_x = start_x - have_left; - const int context_width = max_x + have_left + have_right; +#if (WEBP_NEAR_LOSSLESS == 1) + const int context_width = max_x + have_left + (max_x < width - start_x); +#endif const int tiles_per_row = VP8LSubSampleSize(width, bits); // Prediction modes of the left and above neighbor tiles. const int left_mode = (tile_x > 0) ? @@ -352,10 +368,12 @@ static int GetBestPredictorForTile(int width, int height, memcpy(current_row + context_start_x, argb + y * width + context_start_x, sizeof(*argb) * (max_x + have_left + (y + 1 < height))); +#if (WEBP_NEAR_LOSSLESS == 1) if (max_quantization > 1 && y >= 1 && y + 1 < height) { MaxDiffsForRow(context_width, width, argb + y * width + context_start_x, max_diffs + context_start_x, used_subtract_green); } +#endif GetResidual(width, height, upper_row, current_row, max_diffs, mode, start_x, start_x + max_x, y, max_quantization, exact, @@ -405,7 +423,9 @@ static void CopyImageWithPrediction(int width, int height, uint32_t* upper_row = argb_scratch; uint32_t* current_row = upper_row + width + 1; uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1); +#if (WEBP_NEAR_LOSSLESS == 1) uint8_t* lower_max_diffs = current_max_diffs + width; +#endif int y; for (y = 0; y < height; ++y) { @@ -420,6 +440,7 @@ static void CopyImageWithPrediction(int width, int height, PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row, argb + y * width); } else { +#if (WEBP_NEAR_LOSSLESS == 1) if (max_quantization > 1) { // Compute max_diffs for the lower row now, because that needs the // contents of argb for the current row, which we will overwrite with @@ -432,6 +453,7 @@ static void CopyImageWithPrediction(int width, int height, used_subtract_green); } } +#endif for (x = 0; x < width;) { const int mode = (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; @@ -565,7 +587,7 @@ static void GetBestGreenToRed( } } } - best_tx->green_to_red_ = green_to_red_best; + best_tx->green_to_red_ = (green_to_red_best & 0xff); } static float GetPredictionCostCrossColorBlue( @@ -644,8 +666,8 @@ static void GetBestGreenRedToBlue( break; // out of iter-loop. } } - best_tx->green_to_blue_ = green_to_blue_best; - best_tx->red_to_blue_ = red_to_blue_best; + best_tx->green_to_blue_ = green_to_blue_best & 0xff; + best_tx->red_to_blue_ = red_to_blue_best & 0xff; } #undef kGreenRedToBlueMaxIters #undef kGreenRedToBlueNumAxis diff --git a/src/3rdparty/libwebp/src/enc/quant_enc.c b/src/3rdparty/libwebp/src/enc/quant_enc.c index b118fb2..01eb565 100644 --- a/src/3rdparty/libwebp/src/enc/quant_enc.c +++ b/src/3rdparty/libwebp/src/enc/quant_enc.c @@ -15,8 +15,9 @@ #include <math.h> #include <stdlib.h> // for abs() -#include "./vp8i_enc.h" -#include "./cost_enc.h" +#include "src/dsp/quant.h" +#include "src/enc/vp8i_enc.h" +#include "src/enc/cost_enc.h" #define DO_TRELLIS_I4 1 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate. @@ -32,7 +33,7 @@ // number of non-zero coeffs below which we consider the block very flat // (and apply a penalty to complex predictions) -#define FLATNESS_LIMIT_I16 10 // I16 mode +#define FLATNESS_LIMIT_I16 0 // I16 mode (special case) #define FLATNESS_LIMIT_I4 3 // I4 mode #define FLATNESS_LIMIT_UV 2 // UV mode #define FLATNESS_PENALTY 140 // roughly ~1bit per block @@ -457,11 +458,11 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) { // Form the predictions in cache // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index -const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 }; -const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 }; +const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 }; +const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 }; // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index -const int VP8I4ModeOffsets[NUM_BMODES] = { +const uint16_t VP8I4ModeOffsets[NUM_BMODES] = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; @@ -492,14 +493,14 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) { // |YYYY|....| 12 // +----+----+ -const int VP8Scan[16] = { // Luma +const uint16_t VP8Scan[16] = { // Luma 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, }; -static const int VP8ScanUV[4 + 4] = { +static const uint16_t VP8ScanUV[4 + 4] = { 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V }; @@ -826,6 +827,85 @@ static int ReconstructIntra4(VP8EncIterator* const it, return nz; } +//------------------------------------------------------------------------------ +// DC-error diffusion + +// Diffusion weights. We under-correct a bit (15/16th of the error is actually +// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0. +#define C1 7 // fraction of error sent to the 4x4 block below +#define C2 8 // fraction of error sent to the 4x4 block on the right +#define DSHIFT 4 +#define DSCALE 1 // storage descaling, needed to make the error fit int8_t + +// Quantize as usual, but also compute and return the quantization error. +// Error is already divided by DSHIFT. +static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) { + int V = *v; + const int sign = (V < 0); + if (sign) V = -V; + if (V > (int)mtx->zthresh_[0]) { + const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0]; + const int err = (V - qV); + *v = sign ? -qV : qV; + return (sign ? -err : err) >> DSCALE; + } + *v = 0; + return (sign ? -V : V) >> DSCALE; +} + +static void CorrectDCValues(const VP8EncIterator* const it, + const VP8Matrix* const mtx, + int16_t tmp[][16], VP8ModeScore* const rd) { + // | top[0] | top[1] + // --------+--------+--------- + // left[0] | tmp[0] tmp[1] <-> err0 err1 + // left[1] | tmp[2] tmp[3] err2 err3 + // + // Final errors {err1,err2,err3} are preserved and later restored + // as top[]/left[] on the next block. + int ch; + for (ch = 0; ch <= 1; ++ch) { + const int8_t* const top = it->top_derr_[it->x_][ch]; + const int8_t* const left = it->left_derr_[ch]; + int16_t (* const c)[16] = &tmp[ch * 4]; + int err0, err1, err2, err3; + c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE); + err0 = QuantizeSingle(&c[0][0], mtx); + c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE); + err1 = QuantizeSingle(&c[1][0], mtx); + c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE); + err2 = QuantizeSingle(&c[2][0], mtx); + c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE); + err3 = QuantizeSingle(&c[3][0], mtx); + // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence + // err >> DSCALE will fit in an int8_t type if DSCALE>=1. + assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127); + rd->derr[ch][0] = (int8_t)err1; + rd->derr[ch][1] = (int8_t)err2; + rd->derr[ch][2] = (int8_t)err3; + } +} + +static void StoreDiffusionErrors(VP8EncIterator* const it, + const VP8ModeScore* const rd) { + int ch; + for (ch = 0; ch <= 1; ++ch) { + int8_t* const top = it->top_derr_[it->x_][ch]; + int8_t* const left = it->left_derr_[ch]; + left[0] = rd->derr[ch][0]; // restore err1 + left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3 + top[0] = rd->derr[ch][1]; // ... err2 + top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3. + } +} + +#undef C1 +#undef C2 +#undef DSHIFT +#undef DSCALE + +//------------------------------------------------------------------------------ + static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, uint8_t* const yuv_out, int mode) { const VP8Encoder* const enc = it->enc_; @@ -839,6 +919,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, for (n = 0; n < 8; n += 2) { VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]); } + if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd); + if (DO_TRELLIS_UV && it->do_trellis_) { int ch, x, y; for (ch = 0, n = 0; ch <= 2; ch += 2) { @@ -896,19 +978,6 @@ static void SwapOut(VP8EncIterator* const it) { SwapPtr(&it->yuv_out_, &it->yuv_out2_); } -static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) { - score_t score = 0; - while (num_blocks-- > 0) { // TODO(skal): refine positional scoring? - int i; - for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC - score += (levels[i] != 0); - if (score > thresh) return 0; - } - levels += 16; - } - return 1; -} - static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) { const int kNumBlocks = 16; VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_]; @@ -919,6 +988,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) { VP8ModeScore* rd_cur = &rd_tmp; VP8ModeScore* rd_best = rd; int mode; + int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC); rd->mode_i16 = -1; for (mode = 0; mode < NUM_PRED_MODES; ++mode) { @@ -934,10 +1004,14 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) { tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0; rd_cur->H = VP8FixedCostsI16[mode]; rd_cur->R = VP8GetCostLuma16(it, rd_cur); - if (mode > 0 && - IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) { - // penalty to avoid flat area to be mispredicted by complex mode - rd_cur->R += FLATNESS_PENALTY * kNumBlocks; + if (is_flat) { + // refine the first impression (which was in pixel space) + is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16); + if (is_flat) { + // Block is very flat. We put emphasis on the distortion being very low! + rd_cur->D *= 2; + rd_cur->SD *= 2; + } } // Since we always examine Intra16 first, we can overwrite *rd directly. @@ -1018,7 +1092,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { : 0; rd_tmp.H = mode_costs[mode]; - // Add flatness penalty + // Add flatness penalty, to avoid flat area to be mispredicted + // by a complex mode. if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) { rd_tmp.R = FLATNESS_PENALTY * kNumBlocks; } else { @@ -1101,6 +1176,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { CopyScore(&rd_best, &rd_uv); rd->mode_uv = mode; memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels)); + if (it->top_derr_ != NULL) { + memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr)); + } SwapPtr(&dst, &tmp_dst); } } @@ -1109,6 +1187,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { if (dst != dst0) { // copy 16x8 block if needed VP8Copy16x8(dst, dst0); } + if (it->top_derr_ != NULL) { // store diffusion errors for next block + StoreDiffusionErrors(it, rd); + } } //------------------------------------------------------------------------------ @@ -1162,16 +1243,24 @@ static void RefineUsingDistortion(VP8EncIterator* const it, const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC; for (mode = 0; mode < NUM_PRED_MODES; ++mode) { const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; - const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT + const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT + VP8FixedCostsI16[mode] * lambda_d_i16; if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) { continue; } + if (score < best_score) { best_mode = mode; best_score = score; } } + if (it->x_ == 0 || it->y_ == 0) { + // avoid starting a checkerboard resonance from the border. See bug #432. + if (IsFlatSource16(src)) { + best_mode = (it->x_ == 0) ? 0 : 2; + try_both_modes = 0; // stick to i16 + } + } VP8SetIntra16Mode(it, best_mode); // we'll reconstruct later, if i16 mode actually gets selected } diff --git a/src/3rdparty/libwebp/src/enc/syntax_enc.c b/src/3rdparty/libwebp/src/enc/syntax_enc.c index 90665bd..a9e5a6c 100644 --- a/src/3rdparty/libwebp/src/enc/syntax_enc.c +++ b/src/3rdparty/libwebp/src/enc/syntax_enc.c @@ -13,10 +13,10 @@ #include <assert.h> -#include "../utils/utils.h" -#include "../webp/format_constants.h" // RIFF constants -#include "../webp/mux_types.h" // ALPHA_FLAG -#include "./vp8i_enc.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" // RIFF constants +#include "src/webp/mux_types.h" // ALPHA_FLAG +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // Helper functions @@ -289,11 +289,17 @@ static int GeneratePartition0(VP8Encoder* const enc) { pos3 = VP8BitWriterPos(bw); +#if !defined(WEBP_DISABLE_STATS) if (enc->pic_->stats) { enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3); enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3); enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_; } +#else + (void)pos1; + (void)pos2; + (void)pos3; +#endif if (bw->error_) { return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY); } diff --git a/src/3rdparty/libwebp/src/enc/token_enc.c b/src/3rdparty/libwebp/src/enc/token_enc.c index 02a0d72..3a2192a 100644 --- a/src/3rdparty/libwebp/src/enc/token_enc.c +++ b/src/3rdparty/libwebp/src/enc/token_enc.c @@ -20,9 +20,9 @@ #include <stdlib.h> #include <string.h> -#include "./cost_enc.h" -#include "./vp8i_enc.h" -#include "../utils/utils.h" +#include "src/enc/cost_enc.h" +#include "src/enc/vp8i_enc.h" +#include "src/utils/utils.h" #if !defined(DISABLE_TOKEN_BUFFER) @@ -195,39 +195,6 @@ int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res, #undef TOKEN_ID //------------------------------------------------------------------------------ -// This function works, but isn't currently used. Saved for later. - -#if 0 - -static void Record(int bit, proba_t* const stats) { - proba_t p = *stats; - if (p >= 0xffff0000u) { // an overflow is inbound. - p = ((p + 1u) >> 1) & 0x7fff7fffu; // -> divide the stats by 2. - } - // record bit count (lower 16 bits) and increment total count (upper 16 bits). - p += 0x00010000u + bit; - *stats = p; -} - -void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) { - const VP8Tokens* p = b->pages_; - while (p != NULL) { - const int N = (p->next_ == NULL) ? b->left_ : 0; - int n = MAX_NUM_TOKEN; - const token_t* const tokens = TOKEN_DATA(p); - while (n-- > N) { - const token_t token = tokens[n]; - if (!(token & FIXED_PROBA_BIT)) { - Record((token >> 15) & 1, stats + (token & 0x3fffu)); - } - } - p = p->next_; - } -} - -#endif // 0 - -//------------------------------------------------------------------------------ // Final coding pass, with known probabilities int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw, @@ -283,8 +250,9 @@ size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) { #else // DISABLE_TOKEN_BUFFER -void VP8TBufferInit(VP8TBuffer* const b) { +void VP8TBufferInit(VP8TBuffer* const b, int page_size) { (void)b; + (void)page_size; } void VP8TBufferClear(VP8TBuffer* const b) { (void)b; diff --git a/src/3rdparty/libwebp/src/enc/tree_enc.c b/src/3rdparty/libwebp/src/enc/tree_enc.c index 2c40fe7..64ed283 100644 --- a/src/3rdparty/libwebp/src/enc/tree_enc.c +++ b/src/3rdparty/libwebp/src/enc/tree_enc.c @@ -11,7 +11,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./vp8i_enc.h" +#include "src/enc/vp8i_enc.h" //------------------------------------------------------------------------------ // Default probabilities diff --git a/src/3rdparty/libwebp/src/enc/vp8i_enc.h b/src/3rdparty/libwebp/src/enc/vp8i_enc.h index 93c95ec..24e1944 100644 --- a/src/3rdparty/libwebp/src/enc/vp8i_enc.h +++ b/src/3rdparty/libwebp/src/enc/vp8i_enc.h @@ -11,16 +11,16 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_ENC_VP8ENCI_H_ -#define WEBP_ENC_VP8ENCI_H_ +#ifndef WEBP_ENC_VP8I_ENC_H_ +#define WEBP_ENC_VP8I_ENC_H_ #include <string.h> // for memcpy() -#include "../dec/common_dec.h" -#include "../dsp/dsp.h" -#include "../utils/bit_writer_utils.h" -#include "../utils/thread_utils.h" -#include "../utils/utils.h" -#include "../webp/encode.h" +#include "src/dec/common_dec.h" +#include "src/dsp/dsp.h" +#include "src/utils/bit_writer_utils.h" +#include "src/utils/thread_utils.h" +#include "src/utils/utils.h" +#include "src/webp/encode.h" #ifdef __cplusplus extern "C" { @@ -30,9 +30,9 @@ extern "C" { // Various defines and enums // version numbers -#define ENC_MAJ_VERSION 0 -#define ENC_MIN_VERSION 6 -#define ENC_REV_VERSION 0 +#define ENC_MAJ_VERSION 1 +#define ENC_MIN_VERSION 0 +#define ENC_REV_VERSION 3 enum { MAX_LF_LEVELS = 64, // Maximum loop filter level MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost @@ -75,10 +75,10 @@ typedef enum { // Rate-distortion optimization levels #define U_OFF_ENC (16) #define V_OFF_ENC (16 + 8) -extern const int VP8Scan[16]; // in quant.c -extern const int VP8UVModeOffsets[4]; // in analyze.c -extern const int VP8I16ModeOffsets[4]; -extern const int VP8I4ModeOffsets[NUM_BMODES]; +extern const uint16_t VP8Scan[16]; +extern const uint16_t VP8UVModeOffsets[4]; +extern const uint16_t VP8I16ModeOffsets[4]; +extern const uint16_t VP8I4ModeOffsets[NUM_BMODES]; // Layout of prediction blocks // intra 16x16 @@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) { // Uncomment the following to remove token-buffer code: // #define DISABLE_TOKEN_BUFFER +// quality below which error-diffusion is enabled +#define ERROR_DIFFUSION_QUALITY 98 + //------------------------------------------------------------------------------ // Headers @@ -201,6 +204,8 @@ typedef struct { score_t i4_penalty_; // penalty for using Intra4 } VP8SegmentInfo; +typedef int8_t DError[2 /* u/v */][2 /* top or left */]; + // Handy transient struct to accumulate score and info during RD-optimization // and mode evaluation. typedef struct { @@ -213,6 +218,7 @@ typedef struct { uint8_t modes_i4[16]; // mode numbers for intra4 predictions int mode_uv; // mode number of chroma prediction uint32_t nz; // non-zero blocks + int8_t derr[2][3]; // DC diffusion errors for U/V for blocks #1/2/3 } VP8ModeScore; // Iterator structure to iterate through macroblocks, pointing to the @@ -242,6 +248,9 @@ typedef struct { int count_down0_; // starting counter value (for progress) int percent0_; // saved initial progress percent + DError left_derr_; // left error diffusion (u/v) + DError *top_derr_; // top diffusion error - NULL if disabled + uint8_t* y_left_; // left luma samples (addressable from index -1 to 15). uint8_t* u_left_; // left u samples (addressable from index -1 to 7) uint8_t* v_left_; // left v samples (addressable from index -1 to 7) @@ -269,7 +278,7 @@ int VP8IteratorIsDone(const VP8EncIterator* const it); // Import uncompressed samples from source. // If tmp_32 is not NULL, import boundary samples too. // tmp_32 is a 32-bytes scratch buffer that must be aligned in memory. -void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32); +void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32); // export decimated samples void VP8IteratorExport(const VP8EncIterator* const it); // go to next macroblock. Returns false if not finished. @@ -330,9 +339,6 @@ int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res, // Estimate the final coded size given a set of 'probas'. size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas); -// unused for now -void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats); - #endif // !DISABLE_TOKEN_BUFFER //------------------------------------------------------------------------------ @@ -404,6 +410,7 @@ struct VP8Encoder { uint8_t* uv_top_; // top u/v samples. // U and V are packed into 16 bytes (8 U + 8 V) LFStats* lf_stats_; // autofilter stats (if NULL, autofilter is off) + DError* top_derr_; // diffusion error (NULL if disabled) }; //------------------------------------------------------------------------------ @@ -502,19 +509,10 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height); // compressibility (no guarantee, though). Assumes that pic->use_argb is true. void WebPCleanupTransparentAreaLossless(WebPPicture* const pic); - // in near_lossless.c -// Near lossless preprocessing in RGB color-space. -int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality); -// Near lossless adjustment for predictors. -void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits, - const uint32_t* argb_orig, - uint32_t* argb, uint32_t* argb_scratch, - const uint32_t* const transform_data, - int quality, int subtract_green); //------------------------------------------------------------------------------ #ifdef __cplusplus } // extern "C" #endif -#endif /* WEBP_ENC_VP8ENCI_H_ */ +#endif // WEBP_ENC_VP8I_ENC_H_ diff --git a/src/3rdparty/libwebp/src/enc/vp8l_enc.c b/src/3rdparty/libwebp/src/enc/vp8l_enc.c index b1a793d..2efd403 100644 --- a/src/3rdparty/libwebp/src/enc/vp8l_enc.c +++ b/src/3rdparty/libwebp/src/enc/vp8l_enc.c @@ -15,20 +15,17 @@ #include <assert.h> #include <stdlib.h> -#include "./backward_references_enc.h" -#include "./histogram_enc.h" -#include "./vp8i_enc.h" -#include "./vp8li_enc.h" -#include "../dsp/lossless.h" -#include "../dsp/lossless_common.h" -#include "../utils/bit_writer_utils.h" -#include "../utils/huffman_encode_utils.h" -#include "../utils/utils.h" -#include "../webp/format_constants.h" - -#include "./delta_palettization_enc.h" - -#define PALETTE_KEY_RIGHT_SHIFT 22 // Key for 1K buffer. +#include "src/enc/backward_references_enc.h" +#include "src/enc/histogram_enc.h" +#include "src/enc/vp8i_enc.h" +#include "src/enc/vp8li_enc.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" +#include "src/utils/bit_writer_utils.h" +#include "src/utils/huffman_encode_utils.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" + // Maximum number of histogram images (sub-blocks). #define MAX_HUFF_IMAGE_SIZE 2600 @@ -128,7 +125,10 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic, uint32_t palette[MAX_PALETTE_SIZE], int* const palette_size) { const int num_colors = WebPGetColorPalette(pic, palette); - if (num_colors > MAX_PALETTE_SIZE) return 0; + if (num_colors > MAX_PALETTE_SIZE) { + *palette_size = 0; + return 0; + } *palette_size = num_colors; qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort); if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) { @@ -188,22 +188,33 @@ static WEBP_INLINE uint32_t HashPix(uint32_t pix) { static int AnalyzeEntropy(const uint32_t* argb, int width, int height, int argb_stride, int use_palette, + int palette_size, int transform_bits, EntropyIx* const min_entropy_ix, int* const red_and_blue_always_zero) { // Allocate histogram set with cache_bits = 0. - uint32_t* const histo = - (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256); + uint32_t* histo; + + if (use_palette && palette_size <= 16) { + // In the case of small palettes, we pack 2, 4 or 8 pixels together. In + // practice, small palettes are better than any other transform. + *min_entropy_ix = kPalette; + *red_and_blue_always_zero = 1; + return 1; + } + histo = (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256); if (histo != NULL) { int i, x, y; - const uint32_t* prev_row = argb; - const uint32_t* curr_row = argb + argb_stride; - for (y = 1; y < height; ++y) { - uint32_t prev_pix = curr_row[0]; - for (x = 1; x < width; ++x) { + const uint32_t* prev_row = NULL; + const uint32_t* curr_row = argb; + uint32_t pix_prev = argb[0]; // Skip the first pixel. + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { const uint32_t pix = curr_row[x]; - const uint32_t pix_diff = VP8LSubPixels(pix, prev_pix); - if ((pix_diff == 0) || (pix == prev_row[x])) continue; - prev_pix = pix; + const uint32_t pix_diff = VP8LSubPixels(pix, pix_prev); + pix_prev = pix; + if ((pix_diff == 0) || (prev_row != NULL && pix == prev_row[x])) { + continue; + } AddSingle(pix, &histo[kHistoAlpha * 256], &histo[kHistoRed * 256], @@ -246,7 +257,7 @@ static int AnalyzeEntropy(const uint32_t* argb, ++histo[kHistoAlphaPred * 256]; for (j = 0; j < kHistoTotal; ++j) { - entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL); + entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256); } entropy[kDirect] = entropy_comp[kHistoAlpha] + entropy_comp[kHistoRed] + @@ -264,8 +275,24 @@ static int AnalyzeEntropy(const uint32_t* argb, entropy_comp[kHistoRedPredSubGreen] + entropy_comp[kHistoGreenPred] + entropy_comp[kHistoBluePredSubGreen]; - // Palette mode seems more efficient in a breakeven case. Bias with 1.0. - entropy[kPalette] = entropy_comp[kHistoPalette] - 1.0; + entropy[kPalette] = entropy_comp[kHistoPalette]; + + // When including transforms, there is an overhead in bits from + // storing them. This overhead is small but matters for small images. + // For spatial, there are 14 transformations. + entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) * + VP8LSubSampleSize(height, transform_bits) * + VP8LFastLog2(14); + // For color transforms: 24 as only 3 channels are considered in a + // ColorTransformElement. + entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) * + VP8LSubSampleSize(height, transform_bits) * + VP8LFastLog2(24); + // For palettes, add the cost of storing the palette. + // We empirically estimate the cost of a compressed entry as 8 bits. + // The palette is differential-coded when compressed hence a much + // lower cost than sizeof(uint32_t)*8. + entropy[kPalette] += palette_size * 8; *min_entropy_ix = kDirect; for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) { @@ -273,6 +300,7 @@ static int AnalyzeEntropy(const uint32_t* argb, *min_entropy_ix = (EntropyIx)k; } } + assert((int)*min_entropy_ix <= last_mode_to_analyze); *red_and_blue_always_zero = 1; // Let's check if the histogram of the chosen entropy mode has // non-zero red and blue values. If all are zero, we can later skip @@ -325,60 +353,94 @@ static int GetTransformBits(int method, int histo_bits) { return res; } -static int AnalyzeAndInit(VP8LEncoder* const enc) { +// Set of parameters to be used in each iteration of the cruncher. +#define CRUNCH_CONFIGS_LZ77_MAX 2 +typedef struct { + int entropy_idx_; + int lz77s_types_to_try_[CRUNCH_CONFIGS_LZ77_MAX]; + int lz77s_types_to_try_size_; +} CrunchConfig; + +#define CRUNCH_CONFIGS_MAX kNumEntropyIx + +static int EncoderAnalyze(VP8LEncoder* const enc, + CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX], + int* const crunch_configs_size, + int* const red_and_blue_always_zero) { const WebPPicture* const pic = enc->pic_; const int width = pic->width; const int height = pic->height; - const int pix_cnt = width * height; const WebPConfig* const config = enc->config_; const int method = config->method; const int low_effort = (config->method == 0); - // we round the block size up, so we're guaranteed to have - // at max MAX_REFS_BLOCK_PER_IMAGE blocks used: - int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1; + int i; + int use_palette; + int n_lz77s; assert(pic != NULL && pic->argb != NULL); - enc->use_cross_color_ = 0; - enc->use_predict_ = 0; - enc->use_subtract_green_ = 0; - enc->use_palette_ = + use_palette = AnalyzeAndCreatePalette(pic, low_effort, enc->palette_, &enc->palette_size_); - // TODO(jyrki): replace the decision to be based on an actual estimate - // of entropy, or even spatial variance of entropy. - enc->histo_bits_ = GetHistoBits(method, enc->use_palette_, + // Empirical bit sizes. + enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height); enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_); if (low_effort) { // AnalyzeEntropy is somewhat slow. - enc->use_predict_ = !enc->use_palette_; - enc->use_subtract_green_ = !enc->use_palette_; - enc->use_cross_color_ = 0; + crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen; + n_lz77s = 1; + *crunch_configs_size = 1; } else { - int red_and_blue_always_zero; EntropyIx min_entropy_ix; - if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, - enc->use_palette_, &min_entropy_ix, - &red_and_blue_always_zero)) { + // Try out multiple LZ77 on images with few colors. + n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1; + if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette, + enc->palette_size_, enc->transform_bits_, + &min_entropy_ix, red_and_blue_always_zero)) { return 0; } - enc->use_palette_ = (min_entropy_ix == kPalette); - enc->use_subtract_green_ = - (min_entropy_ix == kSubGreen) || (min_entropy_ix == kSpatialSubGreen); - enc->use_predict_ = - (min_entropy_ix == kSpatial) || (min_entropy_ix == kSpatialSubGreen); - enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_; + if (method == 6 && config->quality == 100) { + // Go brute force on all transforms. + *crunch_configs_size = 0; + for (i = 0; i < kNumEntropyIx; ++i) { + if (i != kPalette || use_palette) { + assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX); + crunch_configs[(*crunch_configs_size)++].entropy_idx_ = i; + } + } + } else { + // Only choose the guessed best transform. + *crunch_configs_size = 1; + crunch_configs[0].entropy_idx_ = min_entropy_ix; + } } + // Fill in the different LZ77s. + assert(n_lz77s <= CRUNCH_CONFIGS_LZ77_MAX); + for (i = 0; i < *crunch_configs_size; ++i) { + int j; + for (j = 0; j < n_lz77s; ++j) { + crunch_configs[i].lz77s_types_to_try_[j] = + (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box; + } + crunch_configs[i].lz77s_types_to_try_size_ = n_lz77s; + } + return 1; +} +static int EncoderInit(VP8LEncoder* const enc) { + const WebPPicture* const pic = enc->pic_; + const int width = pic->width; + const int height = pic->height; + const int pix_cnt = width * height; + // we round the block size up, so we're guaranteed to have + // at most MAX_REFS_BLOCK_PER_IMAGE blocks used: + const int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1; + int i; if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0; - // palette-friendly input typically uses less literals - // -> reduce block size a bit - if (enc->use_palette_) refs_block_size /= 2; - VP8LBackwardRefsInit(&enc->refs_[0], refs_block_size); - VP8LBackwardRefsInit(&enc->refs_[1], refs_block_size); + for (i = 0; i < 3; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size); return 1; } @@ -400,6 +462,7 @@ static int GetHuffBitLengthsAndCodes( for (i = 0; i < histogram_image_size; ++i) { const VP8LHistogram* const histo = histogram_image->histograms[i]; HuffmanTreeCode* const codes = &huffman_codes[5 * i]; + assert(histo != NULL); for (k = 0; k < 5; ++k) { const int num_symbols = (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) : @@ -571,11 +634,16 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw, length = write_trimmed_length ? trimmed_length : num_tokens; VP8LPutBits(bw, write_trimmed_length, 1); if (write_trimmed_length) { - const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1); - const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2; - VP8LPutBits(bw, nbitpairs - 1, 3); - assert(trimmed_length >= 2); - VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2); + if (trimmed_length == 2) { + VP8LPutBits(bw, 0, 3 + 2); // nbitpairs=1, trimmed_length=2 + } else { + const int nbits = BitsLog2Floor(trimmed_length - 2); + const int nbitpairs = nbits / 2 + 1; + assert(trimmed_length > 2); + assert(nbitpairs - 1 < 8); + VP8LPutBits(bw, nbitpairs - 1, 3); + VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2); + } } StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code); } @@ -642,7 +710,7 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits( static WebPEncodingError StoreImageToBitMask( VP8LBitWriter* const bw, int width, int histo_bits, - VP8LBackwardRefs* const refs, + const VP8LBackwardRefs* const refs, const uint16_t* histogram_symbols, const HuffmanTreeCode* const huffman_codes) { const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1; @@ -665,7 +733,7 @@ static WebPEncodingError StoreImageToBitMask( codes = huffman_codes + 5 * histogram_ix; } if (PixOrCopyIsLiteral(v)) { - static const int order[] = { 1, 2, 0, 3 }; + static const uint8_t order[] = { 1, 2, 0, 3 }; int k; for (k = 0; k < 4; ++k) { const int code = PixOrCopyLiteral(v, order[k]); @@ -686,7 +754,6 @@ static WebPEncodingError StoreImageToBitMask( // Don't write the distance with the extra bits code since // the distance can be up to 18 bits of extra bits, and the prefix // 15 bits, totaling to 33, and our PutBits only supports up to 32 bits. - // TODO(jyrki): optimize this further. VP8LPrefixEncode(distance, &code, &n_bits, &bits); WriteHuffmanCode(bw, codes + 4, code); VP8LPutBits(bw, bits, n_bits); @@ -705,7 +772,8 @@ static WebPEncodingError StoreImageToBitMask( static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, const uint32_t* const argb, VP8LHashChain* const hash_chain, - VP8LBackwardRefs refs_array[2], + VP8LBackwardRefs* const refs_tmp1, + VP8LBackwardRefs* const refs_tmp2, int width, int height, int quality, int low_effort) { int i; @@ -730,8 +798,9 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } - refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, &cache_bits, - hash_chain, refs_array); + refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, + kLZ77Standard | kLZ77RLE, &cache_bits, + hash_chain, refs_tmp1, refs_tmp2); if (refs == NULL) { err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; @@ -741,6 +810,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } + VP8LHistogramSetClear(histogram_image); // Build histogram image and symbols from backward references. VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]); @@ -788,39 +858,37 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw, return err; } -static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw, - const uint32_t* const argb, - VP8LHashChain* const hash_chain, - VP8LBackwardRefs refs_array[2], - int width, int height, int quality, - int low_effort, - int use_cache, int* cache_bits, - int histogram_bits, - size_t init_byte_position, - int* const hdr_size, - int* const data_size) { +static WebPEncodingError EncodeImageInternal( + VP8LBitWriter* const bw, const uint32_t* const argb, + VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[3], int width, + int height, int quality, int low_effort, int use_cache, + const CrunchConfig* const config, int* cache_bits, int histogram_bits, + size_t init_byte_position, int* const hdr_size, int* const data_size) { WebPEncodingError err = VP8_ENC_OK; const uint32_t histogram_image_xysize = VP8LSubSampleSize(width, histogram_bits) * VP8LSubSampleSize(height, histogram_bits); VP8LHistogramSet* histogram_image = NULL; - VP8LHistogramSet* tmp_histos = NULL; + VP8LHistogram* tmp_histo = NULL; int histogram_image_size = 0; size_t bit_array_size = 0; - HuffmanTree* huff_tree = NULL; + HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc( + 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree)); HuffmanTreeToken* tokens = NULL; HuffmanTreeCode* huffman_codes = NULL; - VP8LBackwardRefs refs; - VP8LBackwardRefs* best_refs; + VP8LBackwardRefs* refs_best; + VP8LBackwardRefs* refs_tmp; uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(histogram_image_xysize, sizeof(*histogram_symbols)); + int lz77s_idx; + VP8LBitWriter bw_init = *bw, bw_best; + int hdr_size_tmp; assert(histogram_bits >= MIN_HUFFMAN_BITS); assert(histogram_bits <= MAX_HUFFMAN_BITS); assert(hdr_size != NULL); assert(data_size != NULL); - VP8LBackwardRefsInit(&refs, refs_array[0].block_size_); if (histogram_symbols == NULL) { err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; @@ -836,142 +904,162 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw, // 'best_refs' is the reference to the best backward refs and points to one // of refs_array[0] or refs_array[1]. // Calculate backward references from ARGB image. - if (!VP8LHashChainFill(hash_chain, quality, argb, width, height, - low_effort)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - best_refs = VP8LGetBackwardReferences(width, height, argb, quality, - low_effort, cache_bits, hash_chain, - refs_array); - if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - histogram_image = - VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits); - tmp_histos = VP8LAllocateHistogramSet(2, *cache_bits); - if (histogram_image == NULL || tmp_histos == NULL) { + if (huff_tree == NULL || + !VP8LHashChainFill(hash_chain, quality, argb, width, height, + low_effort) || + !VP8LBitWriterInit(&bw_best, 0) || + (config->lz77s_types_to_try_size_ > 1 && + !VP8LBitWriterClone(bw, &bw_best))) { err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } + for (lz77s_idx = 0; lz77s_idx < config->lz77s_types_to_try_size_; + ++lz77s_idx) { + refs_best = VP8LGetBackwardReferences( + width, height, argb, quality, low_effort, + config->lz77s_types_to_try_[lz77s_idx], cache_bits, hash_chain, + &refs_array[0], &refs_array[1]); + if (refs_best == NULL) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + // Keep the best references aside and use the other element from the first + // two as a temporary for later usage. + refs_tmp = &refs_array[refs_best == &refs_array[0] ? 1 : 0]; + + histogram_image = + VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits); + tmp_histo = VP8LAllocateHistogram(*cache_bits); + if (histogram_image == NULL || tmp_histo == NULL) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } - // Build histogram image and symbols from backward references. - if (!VP8LGetHistoImageSymbols(width, height, &refs, quality, low_effort, - histogram_bits, *cache_bits, histogram_image, - tmp_histos, histogram_symbols)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - // Create Huffman bit lengths and codes for each histogram image. - histogram_image_size = histogram_image->size; - bit_array_size = 5 * histogram_image_size; - huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size, - sizeof(*huffman_codes)); - // Note: some histogram_image entries may point to tmp_histos[], so the latter - // need to outlive the following call to GetHuffBitLengthsAndCodes(). - if (huffman_codes == NULL || - !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - // Free combined histograms. - VP8LFreeHistogramSet(histogram_image); - histogram_image = NULL; + // Build histogram image and symbols from backward references. + if (!VP8LGetHistoImageSymbols(width, height, refs_best, quality, low_effort, + histogram_bits, *cache_bits, histogram_image, + tmp_histo, histogram_symbols)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + // Create Huffman bit lengths and codes for each histogram image. + histogram_image_size = histogram_image->size; + bit_array_size = 5 * histogram_image_size; + huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size, + sizeof(*huffman_codes)); + // Note: some histogram_image entries may point to tmp_histos[], so the + // latter need to outlive the following call to GetHuffBitLengthsAndCodes(). + if (huffman_codes == NULL || + !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + // Free combined histograms. + VP8LFreeHistogramSet(histogram_image); + histogram_image = NULL; - // Free scratch histograms. - VP8LFreeHistogramSet(tmp_histos); - tmp_histos = NULL; + // Free scratch histograms. + VP8LFreeHistogram(tmp_histo); + tmp_histo = NULL; - // Color Cache parameters. - if (*cache_bits > 0) { - VP8LPutBits(bw, 1, 1); - VP8LPutBits(bw, *cache_bits, 4); - } else { - VP8LPutBits(bw, 0, 1); - } + // Color Cache parameters. + if (*cache_bits > 0) { + VP8LPutBits(bw, 1, 1); + VP8LPutBits(bw, *cache_bits, 4); + } else { + VP8LPutBits(bw, 0, 1); + } - // Huffman image + meta huffman. - { - const int write_histogram_image = (histogram_image_size > 1); - VP8LPutBits(bw, write_histogram_image, 1); - if (write_histogram_image) { - uint32_t* const histogram_argb = - (uint32_t*)WebPSafeMalloc(histogram_image_xysize, - sizeof(*histogram_argb)); - int max_index = 0; - uint32_t i; - if (histogram_argb == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - for (i = 0; i < histogram_image_xysize; ++i) { - const int symbol_index = histogram_symbols[i] & 0xffff; - histogram_argb[i] = (symbol_index << 8); - if (symbol_index >= max_index) { - max_index = symbol_index + 1; + // Huffman image + meta huffman. + { + const int write_histogram_image = (histogram_image_size > 1); + VP8LPutBits(bw, write_histogram_image, 1); + if (write_histogram_image) { + uint32_t* const histogram_argb = + (uint32_t*)WebPSafeMalloc(histogram_image_xysize, + sizeof(*histogram_argb)); + int max_index = 0; + uint32_t i; + if (histogram_argb == NULL) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + for (i = 0; i < histogram_image_xysize; ++i) { + const int symbol_index = histogram_symbols[i] & 0xffff; + histogram_argb[i] = (symbol_index << 8); + if (symbol_index >= max_index) { + max_index = symbol_index + 1; + } } + histogram_image_size = max_index; + + VP8LPutBits(bw, histogram_bits - 2, 3); + err = EncodeImageNoHuffman( + bw, histogram_argb, hash_chain, refs_tmp, &refs_array[2], + VP8LSubSampleSize(width, histogram_bits), + VP8LSubSampleSize(height, histogram_bits), quality, low_effort); + WebPSafeFree(histogram_argb); + if (err != VP8_ENC_OK) goto Error; } - histogram_image_size = max_index; - - VP8LPutBits(bw, histogram_bits - 2, 3); - err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array, - VP8LSubSampleSize(width, histogram_bits), - VP8LSubSampleSize(height, histogram_bits), - quality, low_effort); - WebPSafeFree(histogram_argb); - if (err != VP8_ENC_OK) goto Error; } - } - // Store Huffman codes. - { - int i; - int max_tokens = 0; - huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES, - sizeof(*huff_tree)); - if (huff_tree == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } - // Find maximum number of symbols for the huffman tree-set. - for (i = 0; i < 5 * histogram_image_size; ++i) { - HuffmanTreeCode* const codes = &huffman_codes[i]; - if (max_tokens < codes->num_symbols) { - max_tokens = codes->num_symbols; + // Store Huffman codes. + { + int i; + int max_tokens = 0; + // Find maximum number of symbols for the huffman tree-set. + for (i = 0; i < 5 * histogram_image_size; ++i) { + HuffmanTreeCode* const codes = &huffman_codes[i]; + if (max_tokens < codes->num_symbols) { + max_tokens = codes->num_symbols; + } + } + tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens)); + if (tokens == NULL) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + for (i = 0; i < 5 * histogram_image_size; ++i) { + HuffmanTreeCode* const codes = &huffman_codes[i]; + StoreHuffmanCode(bw, huff_tree, tokens, codes); + ClearHuffmanTreeIfOnlyOneSymbol(codes); } } - tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, - sizeof(*tokens)); - if (tokens == NULL) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; + // Store actual literals. + hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position); + err = StoreImageToBitMask(bw, width, histogram_bits, refs_best, + histogram_symbols, huffman_codes); + // Keep track of the smallest image so far. + if (lz77s_idx == 0 || + VP8LBitWriterNumBytes(bw) < VP8LBitWriterNumBytes(&bw_best)) { + *hdr_size = hdr_size_tmp; + *data_size = + (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size); + VP8LBitWriterSwap(bw, &bw_best); } - for (i = 0; i < 5 * histogram_image_size; ++i) { - HuffmanTreeCode* const codes = &huffman_codes[i]; - StoreHuffmanCode(bw, huff_tree, tokens, codes); - ClearHuffmanTreeIfOnlyOneSymbol(codes); + // Reset the bit writer for the following iteration if any. + if (config->lz77s_types_to_try_size_ > 1) VP8LBitWriterReset(&bw_init, bw); + WebPSafeFree(tokens); + tokens = NULL; + if (huffman_codes != NULL) { + WebPSafeFree(huffman_codes->codes); + WebPSafeFree(huffman_codes); + huffman_codes = NULL; } } - - *hdr_size = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position); - // Store actual literals. - err = StoreImageToBitMask(bw, width, histogram_bits, &refs, - histogram_symbols, huffman_codes); - *data_size = - (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size); + VP8LBitWriterSwap(bw, &bw_best); Error: WebPSafeFree(tokens); WebPSafeFree(huff_tree); VP8LFreeHistogramSet(histogram_image); - VP8LFreeHistogramSet(tmp_histos); - VP8LBackwardRefsClear(&refs); + VP8LFreeHistogram(tmp_histo); if (huffman_codes != NULL) { WebPSafeFree(huffman_codes->codes); WebPSafeFree(huffman_codes); } WebPSafeFree(histogram_symbols); + VP8LBitWriterWipeOut(&bw_best); return err; } @@ -1005,11 +1093,11 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc, VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2); assert(pred_bits >= 2); VP8LPutBits(bw, pred_bits - 2, 3); - return EncodeImageNoHuffman(bw, enc->transform_data_, - (VP8LHashChain*)&enc->hash_chain_, - (VP8LBackwardRefs*)enc->refs_, // cast const away - transform_width, transform_height, - quality, low_effort); + return EncodeImageNoHuffman( + bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_, + (VP8LBackwardRefs*)&enc->refs_[0], // cast const away + (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height, + quality, low_effort); } static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc, @@ -1026,11 +1114,11 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc, VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2); assert(ccolor_transform_bits >= 2); VP8LPutBits(bw, ccolor_transform_bits - 2, 3); - return EncodeImageNoHuffman(bw, enc->transform_data_, - (VP8LHashChain*)&enc->hash_chain_, - (VP8LBackwardRefs*)enc->refs_, // cast const away - transform_width, transform_height, - quality, low_effort); + return EncodeImageNoHuffman( + bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_, + (VP8LBackwardRefs*)&enc->refs_[0], // cast const away + (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height, + quality, low_effort); } // ----------------------------------------------------------------------------- @@ -1144,6 +1232,7 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc, } enc->transform_mem_ = mem; enc->transform_mem_size_ = (size_t)mem_size; + enc->argb_content_ = kEncoderNone; } enc->argb_ = mem; mem = (uint32_t*)WEBP_ALIGN(mem + image_size); @@ -1161,14 +1250,22 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) { const WebPPicture* const picture = enc->pic_; const int width = picture->width; const int height = picture->height; - int y; + err = AllocateTransformBuffer(enc, width, height); if (err != VP8_ENC_OK) return err; - for (y = 0; y < height; ++y) { - memcpy(enc->argb_ + y * width, - picture->argb + y * picture->argb_stride, - width * sizeof(*enc->argb_)); + if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK; + + { + uint32_t* dst = enc->argb_; + const uint32_t* src = picture->argb; + int y; + for (y = 0; y < height; ++y) { + memcpy(dst, src, width * sizeof(*dst)); + dst += width; + src += picture->argb_stride; + } } + enc->argb_content_ = kEncoderARGB; assert(enc->current_width_ == width); return VP8_ENC_OK; } @@ -1215,12 +1312,13 @@ static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) { static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) { // Forget about alpha. - return ((color & 0x00ffffffu) * 4222244071u) >> (32 - PALETTE_INV_SIZE_BITS); + return ((uint32_t)((color & 0x00ffffffu) * 4222244071ull)) >> + (32 - PALETTE_INV_SIZE_BITS); } static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) { // Forget about alpha. - return (color & 0x00ffffffu) * ((1u << 31) - 1) >> + return ((uint32_t)((color & 0x00ffffffu) * ((1ull << 31) - 1))) >> (32 - PALETTE_INV_SIZE_BITS); } @@ -1346,6 +1444,7 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc, err = ApplyPalette(src, src_stride, enc->argb_, enc->current_width_, palette, palette_size, width, height, xbits); + enc->argb_content_ = kEncoderPalette; return err; } @@ -1364,52 +1463,11 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort, tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]); } tmp_palette[0] = palette[0]; - return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_, - palette_size, 1, 20 /* quality */, low_effort); -} - -#ifdef WEBP_EXPERIMENTAL_FEATURES - -static WebPEncodingError EncodeDeltaPalettePredictorImage( - VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality, - int low_effort) { - const WebPPicture* const pic = enc->pic_; - const int width = pic->width; - const int height = pic->height; - - const int pred_bits = 5; - const int transform_width = VP8LSubSampleSize(width, pred_bits); - const int transform_height = VP8LSubSampleSize(height, pred_bits); - const int pred = 7; // default is Predictor7 (Top/Left Average) - const int tiles_per_row = VP8LSubSampleSize(width, pred_bits); - const int tiles_per_col = VP8LSubSampleSize(height, pred_bits); - uint32_t* predictors; - int tile_x, tile_y; - WebPEncodingError err = VP8_ENC_OK; - - predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row, - sizeof(*predictors)); - if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY; - - for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) { - for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) { - predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8); - } - } - - VP8LPutBits(bw, TRANSFORM_PRESENT, 1); - VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2); - VP8LPutBits(bw, pred_bits - 2, 3); - err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_, - (VP8LBackwardRefs*)enc->refs_, // cast const away - transform_width, transform_height, - quality, low_effort); - WebPSafeFree(predictors); - return err; + return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, + &enc->refs_[0], &enc->refs_[1], palette_size, 1, + 20 /* quality */, low_effort); } -#endif // WEBP_EXPERIMENTAL_FEATURES - // ----------------------------------------------------------------------------- // VP8LEncoder @@ -1422,6 +1480,7 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config, } enc->config_ = config; enc->pic_ = picture; + enc->argb_content_ = kEncoderNone; VP8LEncDspInit(); @@ -1430,9 +1489,9 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config, static void VP8LEncoderDelete(VP8LEncoder* enc) { if (enc != NULL) { + int i; VP8LHashChainClear(&enc->hash_chain_); - VP8LBackwardRefsClear(&enc->refs_[0]); - VP8LBackwardRefsClear(&enc->refs_[1]); + for (i = 0; i < 3; ++i) VP8LBackwardRefsClear(&enc->refs_[i]); ClearTransformBuffer(enc); WebPSafeFree(enc); } @@ -1441,134 +1500,324 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) { // ----------------------------------------------------------------------------- // Main call -WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, - const WebPPicture* const picture, - VP8LBitWriter* const bw, int use_cache) { +typedef struct { + const WebPConfig* config_; + const WebPPicture* picture_; + VP8LBitWriter* bw_; + VP8LEncoder* enc_; + int use_cache_; + CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX]; + int num_crunch_configs_; + int red_and_blue_always_zero_; + WebPEncodingError err_; + WebPAuxStats* stats_; +} StreamEncodeContext; + +static int EncodeStreamHook(void* input, void* data2) { + StreamEncodeContext* const params = (StreamEncodeContext*)input; + const WebPConfig* const config = params->config_; + const WebPPicture* const picture = params->picture_; + VP8LBitWriter* const bw = params->bw_; + VP8LEncoder* const enc = params->enc_; + const int use_cache = params->use_cache_; + const CrunchConfig* const crunch_configs = params->crunch_configs_; + const int num_crunch_configs = params->num_crunch_configs_; + const int red_and_blue_always_zero = params->red_and_blue_always_zero_; +#if !defined(WEBP_DISABLE_STATS) + WebPAuxStats* const stats = params->stats_; +#endif WebPEncodingError err = VP8_ENC_OK; const int quality = (int)config->quality; const int low_effort = (config->method == 0); +#if (WEBP_NEAR_LOSSLESS == 1) const int width = picture->width; +#endif const int height = picture->height; - VP8LEncoder* const enc = VP8LEncoderNew(config, picture); const size_t byte_position = VP8LBitWriterNumBytes(bw); +#if (WEBP_NEAR_LOSSLESS == 1) int use_near_lossless = 0; +#endif int hdr_size = 0; int data_size = 0; int use_delta_palette = 0; + int idx; + size_t best_size = 0; + VP8LBitWriter bw_init = *bw, bw_best; + (void)data2; - if (enc == NULL) { + if (!VP8LBitWriterInit(&bw_best, 0) || + (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) { err = VP8_ENC_ERROR_OUT_OF_MEMORY; goto Error; } - // --------------------------------------------------------------------------- - // Analyze image (entropy, num_palettes etc) - - if (!AnalyzeAndInit(enc)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; - } + for (idx = 0; idx < num_crunch_configs; ++idx) { + const int entropy_idx = crunch_configs[idx].entropy_idx_; + enc->use_palette_ = (entropy_idx == kPalette); + enc->use_subtract_green_ = + (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen); + enc->use_predict_ = + (entropy_idx == kSpatial) || (entropy_idx == kSpatialSubGreen); + if (low_effort) { + enc->use_cross_color_ = 0; + } else { + enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_; + } + // Reset any parameter in the encoder that is set in the previous iteration. + enc->cache_bits_ = 0; + VP8LBackwardRefsClear(&enc->refs_[0]); + VP8LBackwardRefsClear(&enc->refs_[1]); - // Apply near-lossless preprocessing. - use_near_lossless = - (config->near_lossless < 100) && !enc->use_palette_ && !enc->use_predict_; - if (use_near_lossless) { - if (!VP8ApplyNearLossless(width, height, picture->argb, - config->near_lossless)) { - err = VP8_ENC_ERROR_OUT_OF_MEMORY; - goto Error; +#if (WEBP_NEAR_LOSSLESS == 1) + // Apply near-lossless preprocessing. + use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ && + !enc->use_predict_; + if (use_near_lossless) { + err = AllocateTransformBuffer(enc, width, height); + if (err != VP8_ENC_OK) goto Error; + if ((enc->argb_content_ != kEncoderNearLossless) && + !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + enc->argb_content_ = kEncoderNearLossless; + } else { + enc->argb_content_ = kEncoderNone; } - } +#else + enc->argb_content_ = kEncoderNone; +#endif -#ifdef WEBP_EXPERIMENTAL_FEATURES - if (config->use_delta_palette) { - enc->use_predict_ = 1; - enc->use_cross_color_ = 0; - enc->use_subtract_green_ = 0; - enc->use_palette_ = 1; - err = MakeInputImageCopy(enc); - if (err != VP8_ENC_OK) goto Error; - err = WebPSearchOptimalDeltaPalette(enc); - if (err != VP8_ENC_OK) goto Error; + // Encode palette if (enc->use_palette_) { - err = AllocateTransformBuffer(enc, width, height); + err = EncodePalette(bw, low_effort, enc); if (err != VP8_ENC_OK) goto Error; - err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort); + err = MapImageFromPalette(enc, use_delta_palette); if (err != VP8_ENC_OK) goto Error; - use_delta_palette = 1; + // If using a color cache, do not have it bigger than the number of + // colors. + if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) { + enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1; + } } - } -#endif // WEBP_EXPERIMENTAL_FEATURES + if (!use_delta_palette) { + // In case image is not packed. + if (enc->argb_content_ != kEncoderNearLossless && + enc->argb_content_ != kEncoderPalette) { + err = MakeInputImageCopy(enc); + if (err != VP8_ENC_OK) goto Error; + } - // Encode palette - if (enc->use_palette_) { - err = EncodePalette(bw, low_effort, enc); - if (err != VP8_ENC_OK) goto Error; - err = MapImageFromPalette(enc, use_delta_palette); - if (err != VP8_ENC_OK) goto Error; - // If using a color cache, do not have it bigger than the number of colors. - if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) { - enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1; - } - } - if (!use_delta_palette) { - // In case image is not packed. - if (enc->argb_ == NULL) { - err = MakeInputImageCopy(enc); - if (err != VP8_ENC_OK) goto Error; + // ----------------------------------------------------------------------- + // Apply transforms and write transform data. + + if (enc->use_subtract_green_) { + ApplySubtractGreen(enc, enc->current_width_, height, bw); + } + + if (enc->use_predict_) { + err = ApplyPredictFilter(enc, enc->current_width_, height, quality, + low_effort, enc->use_subtract_green_, bw); + if (err != VP8_ENC_OK) goto Error; + } + + if (enc->use_cross_color_) { + err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality, + low_effort, bw); + if (err != VP8_ENC_OK) goto Error; + } } + VP8LPutBits(bw, !TRANSFORM_PRESENT, 1); // No more transforms. + // ------------------------------------------------------------------------- - // Apply transforms and write transform data. + // Encode and write the transformed image. + err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_, + enc->current_width_, height, quality, low_effort, + use_cache, &crunch_configs[idx], + &enc->cache_bits_, enc->histo_bits_, + byte_position, &hdr_size, &data_size); + if (err != VP8_ENC_OK) goto Error; - if (enc->use_subtract_green_) { - ApplySubtractGreen(enc, enc->current_width_, height, bw); + // If we are better than what we already have. + if (idx == 0 || VP8LBitWriterNumBytes(bw) < best_size) { + best_size = VP8LBitWriterNumBytes(bw); + // Store the BitWriter. + VP8LBitWriterSwap(bw, &bw_best); +#if !defined(WEBP_DISABLE_STATS) + // Update the stats. + if (stats != NULL) { + stats->lossless_features = 0; + if (enc->use_predict_) stats->lossless_features |= 1; + if (enc->use_cross_color_) stats->lossless_features |= 2; + if (enc->use_subtract_green_) stats->lossless_features |= 4; + if (enc->use_palette_) stats->lossless_features |= 8; + stats->histogram_bits = enc->histo_bits_; + stats->transform_bits = enc->transform_bits_; + stats->cache_bits = enc->cache_bits_; + stats->palette_size = enc->palette_size_; + stats->lossless_size = (int)(best_size - byte_position); + stats->lossless_hdr_size = hdr_size; + stats->lossless_data_size = data_size; + } +#endif } + // Reset the bit writer for the following iteration if any. + if (num_crunch_configs > 1) VP8LBitWriterReset(&bw_init, bw); + } + VP8LBitWriterSwap(&bw_best, bw); - if (enc->use_predict_) { - err = ApplyPredictFilter(enc, enc->current_width_, height, quality, - low_effort, enc->use_subtract_green_, bw); - if (err != VP8_ENC_OK) goto Error; - } +Error: + VP8LBitWriterWipeOut(&bw_best); + params->err_ = err; + // The hook should return false in case of error. + return (err == VP8_ENC_OK); +} - if (enc->use_cross_color_) { - err = ApplyCrossColorFilter(enc, enc->current_width_, - height, quality, low_effort, bw); - if (err != VP8_ENC_OK) goto Error; - } +WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, + const WebPPicture* const picture, + VP8LBitWriter* const bw_main, + int use_cache) { + WebPEncodingError err = VP8_ENC_OK; + VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture); + VP8LEncoder* enc_side = NULL; + CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX]; + int num_crunch_configs_main, num_crunch_configs_side = 0; + int idx; + int red_and_blue_always_zero = 0; + WebPWorker worker_main, worker_side; + StreamEncodeContext params_main, params_side; + // The main thread uses picture->stats, the side thread uses stats_side. + WebPAuxStats stats_side; + VP8LBitWriter bw_side; + const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface(); + int ok_main; + + // Analyze image (entropy, num_palettes etc) + if (enc_main == NULL || + !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main, + &red_and_blue_always_zero) || + !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; } - VP8LPutBits(bw, !TRANSFORM_PRESENT, 1); // No more transforms. + // Split the configs between the main and side threads (if any). + if (config->thread_level > 0) { + num_crunch_configs_side = num_crunch_configs_main / 2; + for (idx = 0; idx < num_crunch_configs_side; ++idx) { + params_side.crunch_configs_[idx] = + crunch_configs[num_crunch_configs_main - num_crunch_configs_side + + idx]; + } + params_side.num_crunch_configs_ = num_crunch_configs_side; + } + num_crunch_configs_main -= num_crunch_configs_side; + for (idx = 0; idx < num_crunch_configs_main; ++idx) { + params_main.crunch_configs_[idx] = crunch_configs[idx]; + } + params_main.num_crunch_configs_ = num_crunch_configs_main; - // --------------------------------------------------------------------------- - // Encode and write the transformed image. - err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_, - enc->current_width_, height, quality, low_effort, - use_cache, &enc->cache_bits_, enc->histo_bits_, - byte_position, &hdr_size, &data_size); - if (err != VP8_ENC_OK) goto Error; + // Fill in the parameters for the thread workers. + { + const int params_size = (num_crunch_configs_side > 0) ? 2 : 1; + for (idx = 0; idx < params_size; ++idx) { + // Create the parameters for each worker. + WebPWorker* const worker = (idx == 0) ? &worker_main : &worker_side; + StreamEncodeContext* const param = + (idx == 0) ? ¶ms_main : ¶ms_side; + param->config_ = config; + param->picture_ = picture; + param->use_cache_ = use_cache; + param->red_and_blue_always_zero_ = red_and_blue_always_zero; + if (idx == 0) { + param->stats_ = picture->stats; + param->bw_ = bw_main; + param->enc_ = enc_main; + } else { + param->stats_ = (picture->stats == NULL) ? NULL : &stats_side; + // Create a side bit writer. + if (!VP8LBitWriterClone(bw_main, &bw_side)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + param->bw_ = &bw_side; + // Create a side encoder. + enc_side = VP8LEncoderNew(config, picture); + if (enc_side == NULL || !EncoderInit(enc_side)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } + // Copy the values that were computed for the main encoder. + enc_side->histo_bits_ = enc_main->histo_bits_; + enc_side->transform_bits_ = enc_main->transform_bits_; + enc_side->palette_size_ = enc_main->palette_size_; + memcpy(enc_side->palette_, enc_main->palette_, + sizeof(enc_main->palette_)); + param->enc_ = enc_side; + } + // Create the workers. + worker_interface->Init(worker); + worker->data1 = param; + worker->data2 = NULL; + worker->hook = EncodeStreamHook; + } + } - if (picture->stats != NULL) { - WebPAuxStats* const stats = picture->stats; - stats->lossless_features = 0; - if (enc->use_predict_) stats->lossless_features |= 1; - if (enc->use_cross_color_) stats->lossless_features |= 2; - if (enc->use_subtract_green_) stats->lossless_features |= 4; - if (enc->use_palette_) stats->lossless_features |= 8; - stats->histogram_bits = enc->histo_bits_; - stats->transform_bits = enc->transform_bits_; - stats->cache_bits = enc->cache_bits_; - stats->palette_size = enc->palette_size_; - stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position); - stats->lossless_hdr_size = hdr_size; - stats->lossless_data_size = data_size; + // Start the second thread if needed. + if (num_crunch_configs_side != 0) { + if (!worker_interface->Reset(&worker_side)) { + err = VP8_ENC_ERROR_OUT_OF_MEMORY; + goto Error; + } +#if !defined(WEBP_DISABLE_STATS) + // This line is here and not in the param initialization above to remove a + // Clang static analyzer warning. + if (picture->stats != NULL) { + memcpy(&stats_side, picture->stats, sizeof(stats_side)); + } +#endif + // This line is only useful to remove a Clang static analyzer warning. + params_side.err_ = VP8_ENC_OK; + worker_interface->Launch(&worker_side); + } + // Execute the main thread. + worker_interface->Execute(&worker_main); + ok_main = worker_interface->Sync(&worker_main); + worker_interface->End(&worker_main); + if (num_crunch_configs_side != 0) { + // Wait for the second thread. + const int ok_side = worker_interface->Sync(&worker_side); + worker_interface->End(&worker_side); + if (!ok_main || !ok_side) { + err = ok_main ? params_side.err_ : params_main.err_; + goto Error; + } + if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) { + VP8LBitWriterSwap(bw_main, &bw_side); +#if !defined(WEBP_DISABLE_STATS) + if (picture->stats != NULL) { + memcpy(picture->stats, &stats_side, sizeof(*picture->stats)); + } +#endif + } + } else { + if (!ok_main) { + err = params_main.err_; + goto Error; + } } - Error: - VP8LEncoderDelete(enc); +Error: + VP8LBitWriterWipeOut(&bw_side); + VP8LEncoderDelete(enc_main); + VP8LEncoderDelete(enc_side); return err; } +#undef CRUNCH_CONFIGS_MAX +#undef CRUNCH_CONFIGS_LZ77_MAX + int VP8LEncodeImage(const WebPConfig* const config, const WebPPicture* const picture) { int width, height; @@ -1633,7 +1882,6 @@ int VP8LEncodeImage(const WebPConfig* const config, err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/); if (err != VP8_ENC_OK) goto Error; - // TODO(skal): have a fine-grained progress report in VP8LEncodeStream(). if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort; // Finish the RIFF chunk. @@ -1642,11 +1890,13 @@ int VP8LEncodeImage(const WebPConfig* const config, if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort; +#if !defined(WEBP_DISABLE_STATS) // Save size. if (picture->stats != NULL) { picture->stats->coded_size += (int)coded_size; picture->stats->lossless_size = (int)coded_size; } +#endif if (picture->extra_info != NULL) { const int mb_w = (width + 15) >> 4; diff --git a/src/3rdparty/libwebp/src/enc/vp8li_enc.h b/src/3rdparty/libwebp/src/enc/vp8li_enc.h index 8c5fbcb..d2d0fc5 100644 --- a/src/3rdparty/libwebp/src/enc/vp8li_enc.h +++ b/src/3rdparty/libwebp/src/enc/vp8li_enc.h @@ -11,14 +11,23 @@ // // Author: Vikas Arora (vikaas.arora@gmail.com) -#ifndef WEBP_ENC_VP8LI_H_ -#define WEBP_ENC_VP8LI_H_ +#ifndef WEBP_ENC_VP8LI_ENC_H_ +#define WEBP_ENC_VP8LI_ENC_H_ -#include "./backward_references_enc.h" -#include "./histogram_enc.h" -#include "../utils/bit_writer_utils.h" -#include "../webp/encode.h" -#include "../webp/format_constants.h" +#ifdef HAVE_CONFIG_H +#include "src/webp/config.h" +#endif +// Either WEBP_NEAR_LOSSLESS is defined as 0 in config.h when compiling to +// disable near-lossless, or it is enabled by default. +#ifndef WEBP_NEAR_LOSSLESS +#define WEBP_NEAR_LOSSLESS 1 +#endif + +#include "src/enc/backward_references_enc.h" +#include "src/enc/histogram_enc.h" +#include "src/utils/bit_writer_utils.h" +#include "src/webp/encode.h" +#include "src/webp/format_constants.h" #ifdef __cplusplus extern "C" { @@ -27,16 +36,24 @@ extern "C" { // maximum value of transform_bits_ in VP8LEncoder. #define MAX_TRANSFORM_BITS 6 +typedef enum { + kEncoderNone = 0, + kEncoderARGB, + kEncoderNearLossless, + kEncoderPalette +} VP8LEncoderARGBContent; + typedef struct { const WebPConfig* config_; // user configuration and parameters const WebPPicture* pic_; // input picture. - uint32_t* argb_; // Transformed argb image data. - uint32_t* argb_scratch_; // Scratch memory for argb rows - // (used for prediction). - uint32_t* transform_data_; // Scratch memory for transform data. - uint32_t* transform_mem_; // Currently allocated memory. - size_t transform_mem_size_; // Currently allocated memory size. + uint32_t* argb_; // Transformed argb image data. + VP8LEncoderARGBContent argb_content_; // Content type of the argb buffer. + uint32_t* argb_scratch_; // Scratch memory for argb rows + // (used for prediction). + uint32_t* transform_data_; // Scratch memory for transform data. + uint32_t* transform_mem_; // Currently allocated memory. + size_t transform_mem_size_; // Currently allocated memory size. int current_width_; // Corresponds to packed image width. @@ -54,8 +71,7 @@ typedef struct { uint32_t palette_[MAX_PALETTE_SIZE]; // Some 'scratch' (potentially large) objects. - struct VP8LBackwardRefs refs_[2]; // Backward Refs array corresponding to - // LZ77 & RLE coding. + struct VP8LBackwardRefs refs_[3]; // Backward Refs array for temporaries. VP8LHashChain hash_chain_; // HashChain data for constructing // backward references. } VP8LEncoder; @@ -75,6 +91,13 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config, const WebPPicture* const picture, VP8LBitWriter* const bw, int use_cache); +#if (WEBP_NEAR_LOSSLESS == 1) +// in near_lossless.c +// Near lossless preprocessing in RGB color-space. +int VP8ApplyNearLossless(const WebPPicture* const picture, int quality, + uint32_t* const argb_dst); +#endif + //------------------------------------------------------------------------------ // Image transforms in predictor.c. @@ -92,4 +115,4 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality, } // extern "C" #endif -#endif /* WEBP_ENC_VP8LI_H_ */ +#endif // WEBP_ENC_VP8LI_ENC_H_ diff --git a/src/3rdparty/libwebp/src/enc/webp_enc.c b/src/3rdparty/libwebp/src/enc/webp_enc.c index f18461e..9f4b10c 100644 --- a/src/3rdparty/libwebp/src/enc/webp_enc.c +++ b/src/3rdparty/libwebp/src/enc/webp_enc.c @@ -16,10 +16,10 @@ #include <string.h> #include <math.h> -#include "./cost_enc.h" -#include "./vp8i_enc.h" -#include "./vp8li_enc.h" -#include "../utils/utils.h" +#include "src/enc/cost_enc.h" +#include "src/enc/vp8i_enc.h" +#include "src/enc/vp8li_enc.h" +#include "src/utils/utils.h" // #define PRINT_MEMORY_INFO @@ -159,12 +159,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, + WEBP_ALIGN_CST; // align all const size_t lf_stats_size = config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0; + const size_t top_derr_size = + (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ? + mb_w * sizeof(*enc->top_derr_) : 0; uint8_t* mem; const uint64_t size = (uint64_t)sizeof(*enc) // main struct + WEBP_ALIGN_CST // cache alignment + info_size // modes info + preds_size // prediction modes + samples_size // top/left samples + + top_derr_size // top diffusion error + nz_size // coeff context bits + lf_stats_size; // autofilter stats @@ -175,11 +179,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, " info: %ld\n" " preds: %ld\n" " top samples: %ld\n" + " top diffusion: %ld\n" " non-zero: %ld\n" " lf-stats: %ld\n" " total: %ld\n", sizeof(*enc) + WEBP_ALIGN_CST, info_size, - preds_size, samples_size, nz_size, lf_stats_size, size); + preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size); printf("Transient object sizes:\n" " VP8EncIterator: %ld\n" " VP8ModeScore: %ld\n" @@ -207,7 +212,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, enc->preds_w_ = preds_w; enc->mb_info_ = (VP8MBInfo*)mem; mem += info_size; - enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_; + enc->preds_ = mem + 1 + enc->preds_w_; mem += preds_size; enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem); mem += nz_size; @@ -216,9 +221,11 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config, // top samples (all 16-aligned) mem = (uint8_t*)WEBP_ALIGN(mem); - enc->y_top_ = (uint8_t*)mem; + enc->y_top_ = mem; enc->uv_top_ = enc->y_top_ + top_stride; mem += 2 * top_stride; + enc->top_derr_ = top_derr_size ? (DError*)mem : NULL; + mem += top_derr_size; assert(mem <= (uint8_t*)enc + size); enc->config_ = config; @@ -256,6 +263,7 @@ static int DeleteVP8Encoder(VP8Encoder* enc) { //------------------------------------------------------------------------------ +#if !defined(WEBP_DISABLE_STATS) static double GetPSNR(uint64_t err, uint64_t size) { return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.; } @@ -270,8 +278,10 @@ static void FinalizePSNR(const VP8Encoder* const enc) { stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2); stats->PSNR[4] = (float)GetPSNR(sse[3], size); } +#endif // !defined(WEBP_DISABLE_STATS) static void StoreStats(VP8Encoder* const enc) { +#if !defined(WEBP_DISABLE_STATS) WebPAuxStats* const stats = enc->pic_->stats; if (stats != NULL) { int i, s; @@ -288,7 +298,9 @@ static void StoreStats(VP8Encoder* const enc) { stats->block_count[i] = enc->block_count_[i]; } } +#else // defined(WEBP_DISABLE_STATS) WebPReportProgress(enc->pic_, 100, &enc->percent_); // done! +#endif // !defined(WEBP_DISABLE_STATS) } int WebPEncodingSetError(const WebPPicture* const pic, @@ -336,10 +348,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) { if (!config->lossless) { VP8Encoder* enc = NULL; - if (!config->exact) { - WebPCleanupTransparentArea(pic); - } - if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) { // Make sure we have YUVA samples. if (config->use_sharp_yuv || (config->preprocessing & 4)) { @@ -361,6 +369,10 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) { } } + if (!config->exact) { + WebPCleanupTransparentArea(pic); + } + enc = InitVP8Encoder(config, pic); if (enc == NULL) return 0; // pic->error is already set. // Note: each of the tasks below account for 20% in the progress report. diff --git a/src/3rdparty/libwebp/src/mux/anim_encode.c b/src/3rdparty/libwebp/src/mux/anim_encode.c index 6066388..7be9906 100644 --- a/src/3rdparty/libwebp/src/mux/anim_encode.c +++ b/src/3rdparty/libwebp/src/mux/anim_encode.c @@ -16,12 +16,12 @@ #include <stdio.h> #include <stdlib.h> // for abs() -#include "../mux/animi.h" -#include "../utils/utils.h" -#include "../webp/decode.h" -#include "../webp/encode.h" -#include "../webp/format_constants.h" -#include "../webp/mux.h" +#include "src/mux/animi.h" +#include "src/utils/utils.h" +#include "src/webp/decode.h" +#include "src/webp/encode.h" +#include "src/webp/format_constants.h" +#include "src/webp/mux.h" #if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf @@ -35,7 +35,7 @@ // Stores frame rectangle dimensions. typedef struct { int x_offset_, y_offset_, width_, height_; -} FrameRect; +} FrameRectangle; // Used to store two candidates of encoded data for an animation frame. One of // the two will be chosen later. @@ -50,7 +50,7 @@ struct WebPAnimEncoder { const int canvas_height_; // Canvas height. const WebPAnimEncoderOptions options_; // Global encoding options. - FrameRect prev_rect_; // Previous WebP frame rectangle. + FrameRectangle prev_rect_; // Previous WebP frame rectangle. WebPConfig last_config_; // Cached in case a re-encode is needed. WebPConfig last_config_reversed_; // If 'last_config_' uses lossless, then // this config uses lossy and vice versa; @@ -206,7 +206,7 @@ static void ClearRectangle(WebPPicture* const picture, } static void WebPUtilClearPic(WebPPicture* const picture, - const FrameRect* const rect) { + const FrameRectangle* const rect) { if (rect != NULL) { ClearRectangle(picture, rect->x_offset_, rect->y_offset_, rect->width_, rect->height_); @@ -400,7 +400,7 @@ static WEBP_INLINE int ComparePixelsLossy(const uint32_t* src, int src_step, return 1; } -static int IsEmptyRect(const FrameRect* const rect) { +static int IsEmptyRect(const FrameRectangle* const rect) { return (rect->width_ == 0) || (rect->height_ == 0); } @@ -413,7 +413,7 @@ static int QualityToMaxDiff(float quality) { // Assumes that an initial valid guess of change rectangle 'rect' is passed. static void MinimizeChangeRectangle(const WebPPicture* const src, const WebPPicture* const dst, - FrameRect* const rect, + FrameRectangle* const rect, int is_lossless, float quality) { int i, j; const ComparePixelsFunc compare_pixels = @@ -498,7 +498,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src, } // Snap rectangle to even offsets (and adjust dimensions if needed). -static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) { +static WEBP_INLINE void SnapToEvenOffsets(FrameRectangle* const rect) { rect->width_ += (rect->x_offset_ & 1); rect->height_ += (rect->y_offset_ & 1); rect->x_offset_ &= ~1; @@ -508,9 +508,9 @@ static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) { typedef struct { int should_try_; // Should try this set of parameters. int empty_rect_allowed_; // Frame with empty rectangle can be skipped. - FrameRect rect_ll_; // Frame rectangle for lossless compression. + FrameRectangle rect_ll_; // Frame rectangle for lossless compression. WebPPicture sub_frame_ll_; // Sub-frame pic for lossless compression. - FrameRect rect_lossy_; // Frame rectangle for lossy compression. + FrameRectangle rect_lossy_; // Frame rectangle for lossy compression. // Could be smaller than rect_ll_ as pixels // with small diffs can be ignored. WebPPicture sub_frame_lossy_; // Sub-frame pic for lossless compression. @@ -538,7 +538,8 @@ static void SubFrameParamsFree(SubFrameParams* const params) { static int GetSubRect(const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas, int is_key_frame, int is_first_frame, int empty_rect_allowed, - int is_lossless, float quality, FrameRect* const rect, + int is_lossless, float quality, + FrameRectangle* const rect, WebPPicture* const sub_frame) { if (!is_key_frame || is_first_frame) { // Optimize frame rectangle. // Note: This behaves as expected for first frame, as 'prev_canvas' is @@ -594,7 +595,7 @@ int WebPAnimEncoderRefineRect( const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas, int is_lossless, float quality, int* const x_offset, int* const y_offset, int* const width, int* const height) { - FrameRect rect; + FrameRectangle rect; const int right = clip(*x_offset + *width, 0, curr_canvas->width); const int left = clip(*x_offset, 0, curr_canvas->width - 1); const int bottom = clip(*y_offset + *height, 0, curr_canvas->height); @@ -620,7 +621,7 @@ int WebPAnimEncoderRefineRect( } static void DisposeFrameRectangle(int dispose_method, - const FrameRect* const rect, + const FrameRectangle* const rect, WebPPicture* const curr_canvas) { assert(rect != NULL); if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) { @@ -628,13 +629,13 @@ static void DisposeFrameRectangle(int dispose_method, } } -static uint32_t RectArea(const FrameRect* const rect) { +static uint32_t RectArea(const FrameRectangle* const rect) { return (uint32_t)rect->width_ * rect->height_; } static int IsLosslessBlendingPossible(const WebPPicture* const src, const WebPPicture* const dst, - const FrameRect* const rect) { + const FrameRectangle* const rect) { int i, j; assert(src->width == dst->width && src->height == dst->height); assert(rect->x_offset_ + rect->width_ <= dst->width); @@ -656,7 +657,7 @@ static int IsLosslessBlendingPossible(const WebPPicture* const src, static int IsLossyBlendingPossible(const WebPPicture* const src, const WebPPicture* const dst, - const FrameRect* const rect, + const FrameRectangle* const rect, float quality) { const int max_allowed_diff_lossy = QualityToMaxDiff(quality); int i, j; @@ -683,7 +684,7 @@ static int IsLossyBlendingPossible(const WebPPicture* const src, // transparent pixels. // Returns true if at least one pixel gets modified. static int IncreaseTransparency(const WebPPicture* const src, - const FrameRect* const rect, + const FrameRectangle* const rect, WebPPicture* const dst) { int i, j; int modified = 0; @@ -709,7 +710,7 @@ static int IncreaseTransparency(const WebPPicture* const src, // Assumes lossy compression is being used. // Returns true if at least one pixel gets modified. static int FlattenSimilarBlocks(const WebPPicture* const src, - const FrameRect* const rect, + const FrameRectangle* const rect, WebPPicture* const dst, float quality) { const int max_allowed_diff_lossy = QualityToMaxDiff(quality); int i, j; @@ -778,13 +779,13 @@ static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic, typedef struct { WebPMemoryWriter mem_; WebPMuxFrameInfo info_; - FrameRect rect_; + FrameRectangle rect_; int evaluate_; // True if this candidate should be evaluated. } Candidate; // Generates a candidate encoded frame given a picture and metadata. static WebPEncodingError EncodeCandidate(WebPPicture* const sub_frame, - const FrameRect* const rect, + const FrameRectangle* const rect, const WebPConfig* const encoder_config, int use_blending, Candidate* const candidate) { @@ -958,7 +959,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) { if (new_duration >= MAX_DURATION) { // Special case. // Separate out previous frame from earlier merged frames to avoid overflow. // We add a 1x1 transparent frame for the previous frame, with blending on. - const FrameRect rect = { 0, 0, 1, 1 }; + const FrameRectangle rect = { 0, 0, 1, 1 }; const uint8_t lossless_1x1_bytes[] = { 0x52, 0x49, 0x46, 0x46, 0x14, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50, 0x56, 0x50, 0x38, 0x4c, 0x08, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, @@ -1223,7 +1224,7 @@ static int CacheFrame(WebPAnimEncoder* const enc, enc->prev_candidate_undecided_ = 0; } else { int64_t curr_delta; - FrameRect prev_rect_key, prev_rect_sub; + FrameRectangle prev_rect_key, prev_rect_sub; // Add this as a frame rectangle to enc. error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped); @@ -1535,7 +1536,8 @@ int WebPAnimEncoderAssemble(WebPAnimEncoder* enc, WebPData* webp_data) { if (!enc->got_null_frame_ && enc->in_frame_count_ > 1 && enc->count_ > 0) { // set duration of the last frame to be avg of durations of previous frames. - const double delta_time = enc->prev_timestamp_ - enc->first_timestamp_; + const double delta_time = + (uint32_t)enc->prev_timestamp_ - enc->first_timestamp_; const int average_duration = (int)(delta_time / (enc->in_frame_count_ - 1)); if (!IncreasePreviousDuration(enc, average_duration)) { return 0; diff --git a/src/3rdparty/libwebp/src/mux/animi.h b/src/3rdparty/libwebp/src/mux/animi.h index cecaf1f..34c45ba 100644 --- a/src/3rdparty/libwebp/src/mux/animi.h +++ b/src/3rdparty/libwebp/src/mux/animi.h @@ -14,7 +14,7 @@ #ifndef WEBP_MUX_ANIMI_H_ #define WEBP_MUX_ANIMI_H_ -#include "../webp/mux.h" +#include "src/webp/mux.h" #ifdef __cplusplus extern "C" { @@ -40,4 +40,4 @@ int WebPAnimEncoderRefineRect( } // extern "C" #endif -#endif /* WEBP_MUX_ANIMI_H_ */ +#endif // WEBP_MUX_ANIMI_H_ diff --git a/src/3rdparty/libwebp/src/mux/muxedit.c b/src/3rdparty/libwebp/src/mux/muxedit.c index d2c5305..ccf14b2 100644 --- a/src/3rdparty/libwebp/src/mux/muxedit.c +++ b/src/3rdparty/libwebp/src/mux/muxedit.c @@ -13,8 +13,8 @@ // Vikas (vikasa@google.com) #include <assert.h> -#include "./muxi.h" -#include "../utils/utils.h" +#include "src/mux/muxi.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // Life of a mux object. @@ -69,12 +69,12 @@ void WebPMuxDelete(WebPMux* mux) { if (idx == (INDEX)) { \ err = ChunkAssignData(&chunk, data, copy_data, tag); \ if (err == WEBP_MUX_OK) { \ - err = ChunkSetNth(&chunk, (LIST), nth); \ + err = ChunkSetHead(&chunk, (LIST)); \ } \ return err; \ } -static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, uint32_t nth, +static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, const WebPData* const data, int copy_data) { WebPChunk chunk; WebPMuxError err = WEBP_MUX_NOT_FOUND; @@ -190,7 +190,7 @@ WebPMuxError WebPMuxSetChunk(WebPMux* mux, const char fourcc[4], if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err; // Add the given chunk. - return MuxSet(mux, tag, 1, chunk_data, copy_data); + return MuxSet(mux, tag, chunk_data, copy_data); } // Creates a chunk from given 'data' and sets it as 1st chunk in 'chunk_list'. @@ -202,7 +202,7 @@ static WebPMuxError AddDataToChunkList( ChunkInit(&chunk); err = ChunkAssignData(&chunk, data, copy_data, tag); if (err != WEBP_MUX_OK) goto Err; - err = ChunkSetNth(&chunk, chunk_list, 1); + err = ChunkSetHead(&chunk, chunk_list); if (err != WEBP_MUX_OK) goto Err; return WEBP_MUX_OK; Err: @@ -266,14 +266,14 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info, int copy_data) { WebPMuxImage wpi; WebPMuxError err; - const WebPData* const bitstream = &info->bitstream; // Sanity checks. if (mux == NULL || info == NULL) return WEBP_MUX_INVALID_ARGUMENT; if (info->id != WEBP_CHUNK_ANMF) return WEBP_MUX_INVALID_ARGUMENT; - if (bitstream->bytes == NULL || bitstream->size > MAX_CHUNK_PAYLOAD) { + if (info->bitstream.bytes == NULL || + info->bitstream.size > MAX_CHUNK_PAYLOAD) { return WEBP_MUX_INVALID_ARGUMENT; } @@ -287,7 +287,7 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info, } MuxImageInit(&wpi); - err = SetAlphaAndImageChunks(bitstream, copy_data, &wpi); + err = SetAlphaAndImageChunks(&info->bitstream, copy_data, &wpi); if (err != WEBP_MUX_OK) goto Err; assert(wpi.img_ != NULL); // As SetAlphaAndImageChunks() was successful. @@ -342,7 +342,7 @@ WebPMuxError WebPMuxSetAnimationParams(WebPMux* mux, // Set the animation parameters. PutLE32(data, params->bgcolor); PutLE16(data + 4, params->loop_count); - return MuxSet(mux, kChunks[IDX_ANIM].tag, 1, &anim, 1); + return MuxSet(mux, kChunks[IDX_ANIM].tag, &anim, 1); } WebPMuxError WebPMuxSetCanvasSize(WebPMux* mux, @@ -540,7 +540,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) { PutLE24(data + 4, width - 1); // canvas width. PutLE24(data + 7, height - 1); // canvas height. - return MuxSet(mux, kChunks[IDX_VP8X].tag, 1, &vp8x, 1); + return MuxSet(mux, kChunks[IDX_VP8X].tag, &vp8x, 1); } // Cleans up 'mux' by removing any unnecessary chunks. diff --git a/src/3rdparty/libwebp/src/mux/muxi.h b/src/3rdparty/libwebp/src/mux/muxi.h index e6606aa..7bc0b07 100644 --- a/src/3rdparty/libwebp/src/mux/muxi.h +++ b/src/3rdparty/libwebp/src/mux/muxi.h @@ -14,10 +14,11 @@ #ifndef WEBP_MUX_MUXI_H_ #define WEBP_MUX_MUXI_H_ +#include <assert.h> #include <stdlib.h> -#include "../dec/vp8i_dec.h" -#include "../dec/vp8li_dec.h" -#include "../webp/mux.h" +#include "src/dec/vp8i_dec.h" +#include "src/dec/vp8li_dec.h" +#include "src/webp/mux.h" #ifdef __cplusplus extern "C" { @@ -26,9 +27,9 @@ extern "C" { //------------------------------------------------------------------------------ // Defines and constants. -#define MUX_MAJ_VERSION 0 -#define MUX_MIN_VERSION 4 -#define MUX_REV_VERSION 0 +#define MUX_MAJ_VERSION 1 +#define MUX_MIN_VERSION 0 +#define MUX_REV_VERSION 3 // Chunk object. typedef struct WebPChunk WebPChunk; @@ -126,11 +127,14 @@ WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag); WebPMuxError ChunkAssignData(WebPChunk* chunk, const WebPData* const data, int copy_data, uint32_t tag); -// Sets 'chunk' at nth position in the 'chunk_list'. -// nth = 0 has the special meaning "last of the list". +// Sets 'chunk' as the only element in 'chunk_list' if it is empty. // On success ownership is transferred from 'chunk' to the 'chunk_list'. -WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list, - uint32_t nth); +WebPMuxError ChunkSetHead(WebPChunk* const chunk, WebPChunk** const chunk_list); +// Sets 'chunk' at last position in the 'chunk_list'. +// On success ownership is transferred from 'chunk' to the 'chunk_list'. +// *chunk_list also points towards the last valid element of the initial +// *chunk_list. +WebPMuxError ChunkAppend(WebPChunk* const chunk, WebPChunk*** const chunk_list); // Releases chunk and returns chunk->next_. WebPChunk* ChunkRelease(WebPChunk* const chunk); @@ -143,13 +147,13 @@ void ChunkListDelete(WebPChunk** const chunk_list); // Returns size of the chunk including chunk header and padding byte (if any). static WEBP_INLINE size_t SizeWithPadding(size_t chunk_size) { + assert(chunk_size <= MAX_CHUNK_PAYLOAD); return CHUNK_HEADER_SIZE + ((chunk_size + 1) & ~1U); } // Size of a chunk including header and padding. static WEBP_INLINE size_t ChunkDiskSize(const WebPChunk* chunk) { const size_t data_size = chunk->data_.size; - assert(data_size < MAX_CHUNK_PAYLOAD); return SizeWithPadding(data_size); } @@ -227,4 +231,4 @@ WebPMuxError MuxValidate(const WebPMux* const mux); } // extern "C" #endif -#endif /* WEBP_MUX_MUXI_H_ */ +#endif // WEBP_MUX_MUXI_H_ diff --git a/src/3rdparty/libwebp/src/mux/muxinternal.c b/src/3rdparty/libwebp/src/mux/muxinternal.c index 387b57e..b9ee671 100644 --- a/src/3rdparty/libwebp/src/mux/muxinternal.c +++ b/src/3rdparty/libwebp/src/mux/muxinternal.c @@ -13,8 +13,8 @@ // Vikas (vikasa@google.com) #include <assert.h> -#include "./muxi.h" -#include "../utils/utils.h" +#include "src/mux/muxi.h" +#include "src/utils/utils.h" #define UNDEFINED_CHUNK_SIZE ((uint32_t)(-1)) @@ -111,27 +111,6 @@ WebPChunk* ChunkSearchList(WebPChunk* first, uint32_t nth, uint32_t tag) { return ((nth > 0) && (iter > 0)) ? NULL : first; } -// Outputs a pointer to 'prev_chunk->next_', -// where 'prev_chunk' is the pointer to the chunk at position (nth - 1). -// Returns true if nth chunk was found. -static int ChunkSearchListToSet(WebPChunk** chunk_list, uint32_t nth, - WebPChunk*** const location) { - uint32_t count = 0; - assert(chunk_list != NULL); - *location = chunk_list; - - while (*chunk_list != NULL) { - WebPChunk* const cur_chunk = *chunk_list; - ++count; - if (count == nth) return 1; // Found. - chunk_list = &cur_chunk->next_; - *location = chunk_list; - } - - // *chunk_list is ok to be NULL if adding at last location. - return (nth == 0 || (count == nth - 1)) ? 1 : 0; -} - //------------------------------------------------------------------------------ // Chunk writer methods. @@ -156,11 +135,12 @@ WebPMuxError ChunkAssignData(WebPChunk* chunk, const WebPData* const data, return WEBP_MUX_OK; } -WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list, - uint32_t nth) { +WebPMuxError ChunkSetHead(WebPChunk* const chunk, + WebPChunk** const chunk_list) { WebPChunk* new_chunk; - if (!ChunkSearchListToSet(chunk_list, nth, &chunk_list)) { + assert(chunk_list != NULL); + if (*chunk_list != NULL) { return WEBP_MUX_NOT_FOUND; } @@ -168,11 +148,26 @@ WebPMuxError ChunkSetNth(WebPChunk* chunk, WebPChunk** chunk_list, if (new_chunk == NULL) return WEBP_MUX_MEMORY_ERROR; *new_chunk = *chunk; chunk->owner_ = 0; - new_chunk->next_ = *chunk_list; + new_chunk->next_ = NULL; *chunk_list = new_chunk; return WEBP_MUX_OK; } +WebPMuxError ChunkAppend(WebPChunk* const chunk, + WebPChunk*** const chunk_list) { + assert(chunk_list != NULL && *chunk_list != NULL); + + if (**chunk_list == NULL) { + ChunkSetHead(chunk, *chunk_list); + } else { + WebPChunk* last_chunk = **chunk_list; + while (last_chunk->next_ != NULL) last_chunk = last_chunk->next_; + ChunkSetHead(chunk, &last_chunk->next_); + *chunk_list = &last_chunk->next_; + } + return WEBP_MUX_OK; +} + //------------------------------------------------------------------------------ // Chunk deletion method(s). @@ -232,9 +227,11 @@ void MuxImageInit(WebPMuxImage* const wpi) { WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi) { WebPMuxImage* next; if (wpi == NULL) return NULL; - ChunkDelete(wpi->header_); - ChunkDelete(wpi->alpha_); - ChunkDelete(wpi->img_); + // There should be at most one chunk of header_, alpha_, img_ but we call + // ChunkListDelete to be safe + ChunkListDelete(&wpi->header_); + ChunkListDelete(&wpi->alpha_); + ChunkListDelete(&wpi->img_); ChunkListDelete(&wpi->unknown_); next = wpi->next_; @@ -504,6 +501,20 @@ WebPMuxError MuxValidate(const WebPMux* const mux) { if (!has_animation && (num_anim == 1 || num_frames > 0)) { return WEBP_MUX_INVALID_ARGUMENT; } + if (!has_animation) { + const WebPMuxImage* images = mux->images_; + // There can be only one image. + if (images == NULL || images->next_ != NULL) { + return WEBP_MUX_INVALID_ARGUMENT; + } + // Size must match. + if (mux->canvas_width_ > 0) { + if (images->width_ != mux->canvas_width_ || + images->height_ != mux->canvas_height_) { + return WEBP_MUX_INVALID_ARGUMENT; + } + } + } } // Verify either VP8X chunk is present OR there is only one elem in @@ -515,6 +526,7 @@ WebPMuxError MuxValidate(const WebPMux* const mux) { if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT; // ALPHA_FLAG & alpha chunk(s) are consistent. + // Note: ALPHA_FLAG can be set when there is actually no Alpha data present. if (MuxHasAlpha(mux->images_)) { if (num_vp8x > 0) { // VP8X chunk is present, so it should contain ALPHA_FLAG. @@ -525,8 +537,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) { if (err != WEBP_MUX_OK) return err; if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT; } - } else { // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present. - if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT; } return WEBP_MUX_OK; diff --git a/src/3rdparty/libwebp/src/mux/muxread.c b/src/3rdparty/libwebp/src/mux/muxread.c index 410acd9..268f6ac 100644 --- a/src/3rdparty/libwebp/src/mux/muxread.c +++ b/src/3rdparty/libwebp/src/mux/muxread.c @@ -13,8 +13,8 @@ // Vikas (vikasa@google.com) #include <assert.h> -#include "./muxi.h" -#include "../utils/utils.h" +#include "src/mux/muxi.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // Helper method(s). @@ -43,7 +43,7 @@ static WebPMuxError MuxGet(const WebPMux* const mux, CHUNK_INDEX idx, SWITCH_ID_LIST(IDX_ANIM, mux->anim_); SWITCH_ID_LIST(IDX_EXIF, mux->exif_); SWITCH_ID_LIST(IDX_XMP, mux->xmp_); - SWITCH_ID_LIST(IDX_UNKNOWN, mux->unknown_); + assert(idx != IDX_UNKNOWN); return WEBP_MUX_NOT_FOUND; } #undef SWITCH_ID_LIST @@ -59,6 +59,7 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk, // Sanity checks. if (data_size < CHUNK_HEADER_SIZE) return WEBP_MUX_NOT_ENOUGH_DATA; chunk_size = GetLE32(data + TAG_SIZE); + if (chunk_size > MAX_CHUNK_PAYLOAD) return WEBP_MUX_BAD_DATA; { const size_t chunk_disk_size = SizeWithPadding(chunk_size); @@ -102,6 +103,7 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, const uint8_t* const last = bytes + size; WebPChunk subchunk; size_t subchunk_size; + WebPChunk** unknown_chunk_list = &wpi->unknown_; ChunkInit(&subchunk); assert(chunk->tag_ == kChunks[IDX_ANMF].tag); @@ -116,7 +118,7 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, if (size < hdr_size) goto Fail; ChunkAssignData(&subchunk, &temp, copy_data, chunk->tag_); } - ChunkSetNth(&subchunk, &wpi->header_, 1); + ChunkSetHead(&subchunk, &wpi->header_); wpi->is_partial_ = 1; // Waiting for ALPH and/or VP8/VP8L chunks. // Rest of the chunks. @@ -133,18 +135,23 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data, switch (ChunkGetIdFromTag(subchunk.tag_)) { case WEBP_CHUNK_ALPHA: if (wpi->alpha_ != NULL) goto Fail; // Consecutive ALPH chunks. - if (ChunkSetNth(&subchunk, &wpi->alpha_, 1) != WEBP_MUX_OK) goto Fail; + if (ChunkSetHead(&subchunk, &wpi->alpha_) != WEBP_MUX_OK) goto Fail; wpi->is_partial_ = 1; // Waiting for a VP8 chunk. break; case WEBP_CHUNK_IMAGE: - if (ChunkSetNth(&subchunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Fail; + if (wpi->img_ != NULL) goto Fail; // Only 1 image chunk allowed. + if (ChunkSetHead(&subchunk, &wpi->img_) != WEBP_MUX_OK) goto Fail; if (!MuxImageFinalize(wpi)) goto Fail; wpi->is_partial_ = 0; // wpi is completely filled. break; case WEBP_CHUNK_UNKNOWN: - if (wpi->is_partial_) goto Fail; // Encountered an unknown chunk - // before some image chunks. - if (ChunkSetNth(&subchunk, &wpi->unknown_, 0) != WEBP_MUX_OK) goto Fail; + if (wpi->is_partial_) { + goto Fail; // Encountered an unknown chunk + // before some image chunks. + } + if (ChunkAppend(&subchunk, &unknown_chunk_list) != WEBP_MUX_OK) { + goto Fail; + } break; default: goto Fail; @@ -175,6 +182,9 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, const uint8_t* data; size_t size; WebPChunk chunk; + // Stores the end of the chunk lists so that it is faster to append data to + // their ends. + WebPChunk** chunk_list_ends[WEBP_CHUNK_NIL + 1] = { NULL }; ChunkInit(&chunk); // Sanity checks. @@ -187,7 +197,7 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, size = bitstream->size; if (data == NULL) return NULL; - if (size < RIFF_HEADER_SIZE) return NULL; + if (size < RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE) return NULL; if (GetLE32(data + 0) != MKFOURCC('R', 'I', 'F', 'F') || GetLE32(data + CHUNK_HEADER_SIZE) != MKFOURCC('W', 'E', 'B', 'P')) { return NULL; @@ -196,8 +206,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, mux = WebPMuxNew(); if (mux == NULL) return NULL; - if (size < RIFF_HEADER_SIZE + TAG_SIZE) goto Err; - tag = GetLE32(data + RIFF_HEADER_SIZE); if (tag != kChunks[IDX_VP8].tag && tag != kChunks[IDX_VP8L].tag && @@ -205,13 +213,17 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, goto Err; // First chunk should be VP8, VP8L or VP8X. } - riff_size = SizeWithPadding(GetLE32(data + TAG_SIZE)); - if (riff_size > MAX_CHUNK_PAYLOAD || riff_size > size) { - goto Err; - } else { - if (riff_size < size) { // Redundant data after last chunk. - size = riff_size; // To make sure we don't read any data beyond mux_size. - } + riff_size = GetLE32(data + TAG_SIZE); + if (riff_size > MAX_CHUNK_PAYLOAD) goto Err; + + // Note this padding is historical and differs from demux.c which does not + // pad the file size. + riff_size = SizeWithPadding(riff_size); + if (riff_size < CHUNK_HEADER_SIZE) goto Err; + if (riff_size > size) goto Err; + // There's no point in reading past the end of the RIFF chunk. + if (size > riff_size + CHUNK_HEADER_SIZE) { + size = riff_size + CHUNK_HEADER_SIZE; } end = data + size; @@ -226,7 +238,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, while (data != end) { size_t data_size; WebPChunkId id; - WebPChunk** chunk_list; if (ChunkVerifyAndAssign(&chunk, data, size, riff_size, copy_data) != WEBP_MUX_OK) { goto Err; @@ -236,11 +247,11 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, switch (id) { case WEBP_CHUNK_ALPHA: if (wpi->alpha_ != NULL) goto Err; // Consecutive ALPH chunks. - if (ChunkSetNth(&chunk, &wpi->alpha_, 1) != WEBP_MUX_OK) goto Err; + if (ChunkSetHead(&chunk, &wpi->alpha_) != WEBP_MUX_OK) goto Err; wpi->is_partial_ = 1; // Waiting for a VP8 chunk. break; case WEBP_CHUNK_IMAGE: - if (ChunkSetNth(&chunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Err; + if (ChunkSetHead(&chunk, &wpi->img_) != WEBP_MUX_OK) goto Err; if (!MuxImageFinalize(wpi)) goto Err; wpi->is_partial_ = 0; // wpi is completely filled. PushImage: @@ -257,9 +268,13 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, default: // A non-image chunk. if (wpi->is_partial_) goto Err; // Encountered a non-image chunk before // getting all chunks of an image. - chunk_list = MuxGetChunkListFromId(mux, id); // List to add this chunk. - if (ChunkSetNth(&chunk, chunk_list, 0) != WEBP_MUX_OK) goto Err; + if (chunk_list_ends[id] == NULL) { + chunk_list_ends[id] = + MuxGetChunkListFromId(mux, id); // List to add this chunk. + } + if (ChunkAppend(&chunk, &chunk_list_ends[id]) != WEBP_MUX_OK) goto Err; if (id == WEBP_CHUNK_VP8X) { // grab global specs + if (data_size < CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE) goto Err; mux->canvas_width_ = GetLE24(data + 12) + 1; mux->canvas_height_ = GetLE24(data + 15) + 1; } @@ -270,6 +285,9 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data, ChunkInit(&chunk); } + // Incomplete image. + if (wpi->is_partial_) goto Err; + // Validate mux if complete. if (MuxValidate(mux) != WEBP_MUX_OK) goto Err; @@ -382,6 +400,10 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi, uint8_t* const data = (uint8_t*)WebPSafeMalloc(1ULL, size); if (data == NULL) return WEBP_MUX_MEMORY_ERROR; + // There should be at most one alpha_ chunk and exactly one img_ chunk. + assert(wpi->alpha_ == NULL || wpi->alpha_->next_ == NULL); + assert(wpi->img_ != NULL && wpi->img_->next_ == NULL); + // Main RIFF header. dst = MuxEmitRiffHeader(data, size); diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h index fd7fb04..46b3880 100644 --- a/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h +++ b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h @@ -13,19 +13,19 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_UTILS_BIT_READER_INL_H_ -#define WEBP_UTILS_BIT_READER_INL_H_ +#ifndef WEBP_UTILS_BIT_READER_INL_UTILS_H_ +#define WEBP_UTILS_BIT_READER_INL_UTILS_H_ #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif #include <string.h> // for memcpy -#include "../dsp/dsp.h" -#include "./bit_reader_utils.h" -#include "./endian_inl_utils.h" -#include "./utils.h" +#include "src/dsp/dsp.h" +#include "src/utils/bit_reader_utils.h" +#include "src/utils/endian_inl_utils.h" +#include "src/utils/utils.h" #ifdef __cplusplus extern "C" { @@ -104,7 +104,8 @@ void VP8LoadNewBytes(VP8BitReader* const br) { } // Read a bit with proba 'prob'. Speed-critical function! -static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) { +static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, + int prob, const char label[]) { // Don't move this declaration! It makes a big speed difference to store // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't // alter br->range_ value. @@ -129,13 +130,14 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) { br->bits_ -= shift; } br->range_ = range - 1; + BT_TRACK(br); return bit; } } // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here) static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE -int VP8GetSigned(VP8BitReader* const br, int v) { +int VP8GetSigned(VP8BitReader* const br, int v, const char label[]) { if (br->bits_ < 0) { VP8LoadNewBytes(br); } @@ -148,11 +150,13 @@ int VP8GetSigned(VP8BitReader* const br, int v) { br->range_ += mask; br->range_ |= 1; br->value_ -= (bit_t)((split + 1) & mask) << pos; + BT_TRACK(br); return (v ^ mask) - mask; } } -static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) { +static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, + int prob, const char label[]) { // Don't move this declaration! It makes a big speed difference to store // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't // alter br->range_ value. @@ -179,6 +183,7 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) { br->bits_ -= shift; } br->range_ = range; + BT_TRACK(br); return bit; } } @@ -187,4 +192,4 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) { } // extern "C" #endif -#endif // WEBP_UTILS_BIT_READER_INL_H_ +#endif // WEBP_UTILS_BIT_READER_INL_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_utils.c b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c index c3157e8..857cd60 100644 --- a/src/3rdparty/libwebp/src/utils/bit_reader_utils.c +++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c @@ -12,11 +12,11 @@ // Author: Skal (pascal.massimino@gmail.com) #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif -#include "./bit_reader_inl_utils.h" -#include "../utils/utils.h" +#include "src/utils/bit_reader_inl_utils.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // VP8BitReader @@ -102,17 +102,18 @@ void VP8LoadFinalBytes(VP8BitReader* const br) { //------------------------------------------------------------------------------ // Higher-level calls -uint32_t VP8GetValue(VP8BitReader* const br, int bits) { +uint32_t VP8GetValue(VP8BitReader* const br, int bits, const char label[]) { uint32_t v = 0; while (bits-- > 0) { - v |= VP8GetBit(br, 0x80) << bits; + v |= VP8GetBit(br, 0x80, label) << bits; } return v; } -int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) { - const int value = VP8GetValue(br, bits); - return VP8Get(br) ? -value : value; +int32_t VP8GetSignedValue(VP8BitReader* const br, int bits, + const char label[]) { + const int value = VP8GetValue(br, bits, label); + return VP8Get(br, label) ? -value : value; } //------------------------------------------------------------------------------ @@ -220,3 +221,78 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) { } //------------------------------------------------------------------------------ +// Bit-tracing tool + +#if (BITTRACE > 0) + +#include <stdlib.h> // for atexit() +#include <stdio.h> +#include <string.h> + +#define MAX_NUM_LABELS 32 +static struct { + const char* label; + int size; + int count; +} kLabels[MAX_NUM_LABELS]; + +static int last_label = 0; +static int last_pos = 0; +static const uint8_t* buf_start = NULL; +static int init_done = 0; + +static void PrintBitTraces(void) { + int i; + int scale = 1; + int total = 0; + const char* units = "bits"; +#if (BITTRACE == 2) + scale = 8; + units = "bytes"; +#endif + for (i = 0; i < last_label; ++i) total += kLabels[i].size; + if (total < 1) total = 1; // avoid rounding errors + printf("=== Bit traces ===\n"); + for (i = 0; i < last_label; ++i) { + const int skip = 16 - (int)strlen(kLabels[i].label); + const int value = (kLabels[i].size + scale - 1) / scale; + assert(skip > 0); + printf("%s \%*s: %6d %s \t[%5.2f%%] [count: %7d]\n", + kLabels[i].label, skip, "", value, units, + 100.f * kLabels[i].size / total, + kLabels[i].count); + } + total = (total + scale - 1) / scale; + printf("Total: %d %s\n", total, units); +} + +void BitTrace(const struct VP8BitReader* const br, const char label[]) { + int i, pos; + if (!init_done) { + memset(kLabels, 0, sizeof(kLabels)); + atexit(PrintBitTraces); + buf_start = br->buf_; + init_done = 1; + } + pos = (int)(br->buf_ - buf_start) * 8 - br->bits_; + // if there's a too large jump, we've changed partition -> reset counter + if (abs(pos - last_pos) > 32) { + buf_start = br->buf_; + pos = 0; + last_pos = 0; + } + if (br->range_ >= 0x7f) pos += kVP8Log2Range[br->range_ - 0x7f]; + for (i = 0; i < last_label; ++i) { + if (!strcmp(label, kLabels[i].label)) break; + } + if (i == MAX_NUM_LABELS) abort(); // overflow! + kLabels[i].label = label; + kLabels[i].size += pos - last_pos; + kLabels[i].count += 1; + if (i == last_label) ++last_label; + last_pos = pos; +} + +#endif // BITTRACE > 0 + +//------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_utils.h b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h index ec3426c..e64156e 100644 --- a/src/3rdparty/libwebp/src/utils/bit_reader_utils.h +++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h @@ -12,14 +12,35 @@ // Author: Skal (pascal.massimino@gmail.com) // Vikas Arora (vikaas.arora@gmail.com) -#ifndef WEBP_UTILS_BIT_READER_H_ -#define WEBP_UTILS_BIT_READER_H_ +#ifndef WEBP_UTILS_BIT_READER_UTILS_H_ +#define WEBP_UTILS_BIT_READER_UTILS_H_ #include <assert.h> #ifdef _MSC_VER #include <stdlib.h> // _byteswap_ulong #endif -#include "../webp/types.h" +#include "src/webp/types.h" + +// Warning! This macro triggers quite some MACRO wizardry around func signature! +#if !defined(BITTRACE) +#define BITTRACE 0 // 0 = off, 1 = print bits, 2 = print bytes +#endif + +#if (BITTRACE > 0) +struct VP8BitReader; +extern void BitTrace(const struct VP8BitReader* const br, const char label[]); +#define BT_TRACK(br) BitTrace(br, label) +#define VP8Get(BR, L) VP8GetValue(BR, 1, L) +#else +#define BT_TRACK(br) +// We'll REMOVE the 'const char label[]' from all signatures and calls (!!): +#define VP8GetValue(BR, N, L) VP8GetValue(BR, N) +#define VP8Get(BR, L) VP8GetValue(BR, 1, L) +#define VP8GetSignedValue(BR, N, L) VP8GetSignedValue(BR, N) +#define VP8GetBit(BR, P, L) VP8GetBit(BR, P) +#define VP8GetBitAlt(BR, P, L) VP8GetBitAlt(BR, P) +#define VP8GetSigned(BR, V, L) VP8GetSigned(BR, V) +#endif #ifdef __cplusplus extern "C" { @@ -92,17 +113,15 @@ void VP8BitReaderSetBuffer(VP8BitReader* const br, void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset); // return the next value made of 'num_bits' bits -uint32_t VP8GetValue(VP8BitReader* const br, int num_bits); -static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) { - return VP8GetValue(br, 1); -} +uint32_t VP8GetValue(VP8BitReader* const br, int num_bits, const char label[]); // return the next value with sign-extension. -int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits); +int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits, + const char label[]); // bit_reader_inl.h will implement the following methods: -// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) -// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) +// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob, ...) +// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v, ...) // and should be included by the .c files that actually need them. // This is to avoid recompiling the whole library whenever this file is touched, // and also allowing platform-specific ad-hoc hacks. @@ -155,9 +174,10 @@ static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) { // For jumping over a number of bits in the bit stream when accessed with // VP8LPrefetchBits and VP8LFillBitWindow. +// This function does *not* set br->eos_, since it's speed-critical. +// Use with extreme care! static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) { br->bit_pos_ = val; - br->eos_ = VP8LIsEndOfStream(br); } // Advances the read buffer by 4 bytes to make room for reading next 32 bits. @@ -171,4 +191,4 @@ static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) { } // extern "C" #endif -#endif /* WEBP_UTILS_BIT_READER_H_ */ +#endif // WEBP_UTILS_BIT_READER_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/bit_writer_utils.c b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c index ab0c49d..bef0e31 100644 --- a/src/3rdparty/libwebp/src/utils/bit_writer_utils.c +++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c @@ -16,9 +16,9 @@ #include <string.h> // for memcpy() #include <stdlib.h> -#include "./bit_writer_utils.h" -#include "./endian_inl_utils.h" -#include "./utils.h" +#include "src/utils/bit_writer_utils.h" +#include "src/utils/endian_inl_utils.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // VP8BitWriter @@ -70,7 +70,7 @@ static void Flush(VP8BitWriter* const bw) { const int value = (bits & 0x100) ? 0x00 : 0xff; for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value; } - bw->buf_[pos++] = bits; + bw->buf_[pos++] = bits & 0xff; bw->pos_ = pos; } else { bw->run_++; // delay writing of bytes 0xff, pending eventual carry. @@ -239,6 +239,19 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) { return VP8LBitWriterResize(bw, expected_size); } +int VP8LBitWriterClone(const VP8LBitWriter* const src, + VP8LBitWriter* const dst) { + const size_t current_size = src->cur_ - src->buf_; + assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_); + if (!VP8LBitWriterResize(dst, current_size)) return 0; + memcpy(dst->buf_, src->buf_, current_size); + dst->bits_ = src->bits_; + dst->used_ = src->used_; + dst->error_ = src->error_; + dst->cur_ = dst->buf_ + current_size; + return 1; +} + void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) { if (bw != NULL) { WebPSafeFree(bw->buf_); @@ -246,6 +259,21 @@ void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) { } } +void VP8LBitWriterReset(const VP8LBitWriter* const bw_init, + VP8LBitWriter* const bw) { + bw->bits_ = bw_init->bits_; + bw->used_ = bw_init->used_; + bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_); + assert(bw->cur_ <= bw->end_); + bw->error_ = bw_init->error_; +} + +void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) { + const VP8LBitWriter tmp = *src; + *src = *dst; + *dst = tmp; +} + void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) { // If needed, make some room by flushing some bits out. if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) { diff --git a/src/3rdparty/libwebp/src/utils/bit_writer_utils.h b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h index 9c02bbc..b9d5102 100644 --- a/src/3rdparty/libwebp/src/utils/bit_writer_utils.h +++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h @@ -11,10 +11,10 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_UTILS_BIT_WRITER_H_ -#define WEBP_UTILS_BIT_WRITER_H_ +#ifndef WEBP_UTILS_BIT_WRITER_UTILS_H_ +#define WEBP_UTILS_BIT_WRITER_UTILS_H_ -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -100,16 +100,24 @@ typedef struct { int error_; } VP8LBitWriter; -static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) { +static WEBP_INLINE size_t VP8LBitWriterNumBytes(const VP8LBitWriter* const bw) { return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3); } // Returns false in case of memory allocation error. int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size); +// Returns false in case of memory allocation error. +int VP8LBitWriterClone(const VP8LBitWriter* const src, + VP8LBitWriter* const dst); // Finalize the bitstream coding. Returns a pointer to the internal buffer. uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw); // Release any pending memory and zeroes the object. void VP8LBitWriterWipeOut(VP8LBitWriter* const bw); +// Resets the cursor of the BitWriter bw to when it was like in bw_init. +void VP8LBitWriterReset(const VP8LBitWriter* const bw_init, + VP8LBitWriter* const bw); +// Swaps the memory held by two BitWriters. +void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst); // Internal function for VP8LPutBits flushing 32 bits from the written state. void VP8LPutBitsFlushBits(VP8LBitWriter* const bw); @@ -143,4 +151,4 @@ static WEBP_INLINE void VP8LPutBits(VP8LBitWriter* const bw, } // extern "C" #endif -#endif /* WEBP_UTILS_BIT_WRITER_H_ */ +#endif // WEBP_UTILS_BIT_WRITER_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/color_cache_utils.c b/src/3rdparty/libwebp/src/utils/color_cache_utils.c index 0172590..b09f538 100644 --- a/src/3rdparty/libwebp/src/utils/color_cache_utils.c +++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.c @@ -14,8 +14,8 @@ #include <assert.h> #include <stdlib.h> #include <string.h> -#include "./color_cache_utils.h" -#include "./utils.h" +#include "src/utils/color_cache_utils.h" +#include "src/utils/utils.h" //------------------------------------------------------------------------------ // VP8LColorCache. diff --git a/src/3rdparty/libwebp/src/utils/color_cache_utils.h b/src/3rdparty/libwebp/src/utils/color_cache_utils.h index c373e6b..ec21d51 100644 --- a/src/3rdparty/libwebp/src/utils/color_cache_utils.h +++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.h @@ -12,10 +12,13 @@ // Authors: Jyrki Alakuijala (jyrki@google.com) // Urvang Joshi (urvang@google.com) -#ifndef WEBP_UTILS_COLOR_CACHE_H_ -#define WEBP_UTILS_COLOR_CACHE_H_ +#ifndef WEBP_UTILS_COLOR_CACHE_UTILS_H_ +#define WEBP_UTILS_COLOR_CACHE_UTILS_H_ -#include "../webp/types.h" +#include <assert.h> + +#include "src/dsp/dsp.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -28,10 +31,11 @@ typedef struct { int hash_bits_; } VP8LColorCache; -static const uint64_t kHashMul = 0x1e35a7bdull; +static const uint32_t kHashMul = 0x1e35a7bdu; -static WEBP_INLINE int HashPix(uint32_t argb, int shift) { - return (int)(((argb * kHashMul) & 0xffffffffu) >> shift); +static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE +int VP8LHashPix(uint32_t argb, int shift) { + return (int)((argb * kHashMul) >> shift); } static WEBP_INLINE uint32_t VP8LColorCacheLookup( @@ -48,19 +52,19 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc, static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc, uint32_t argb) { - const int key = HashPix(argb, cc->hash_shift_); + const int key = VP8LHashPix(argb, cc->hash_shift_); cc->colors_[key] = argb; } static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc, uint32_t argb) { - return HashPix(argb, cc->hash_shift_); + return VP8LHashPix(argb, cc->hash_shift_); } // Return the key if cc contains argb, and -1 otherwise. static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc, uint32_t argb) { - const int key = HashPix(argb, cc->hash_shift_); + const int key = VP8LHashPix(argb, cc->hash_shift_); return (cc->colors_[key] == argb) ? key : -1; } @@ -82,4 +86,4 @@ void VP8LColorCacheClear(VP8LColorCache* const color_cache); } #endif -#endif // WEBP_UTILS_COLOR_CACHE_H_ +#endif // WEBP_UTILS_COLOR_CACHE_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/endian_inl_utils.h b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h index e11260f..3630a29 100644 --- a/src/3rdparty/libwebp/src/utils/endian_inl_utils.h +++ b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h @@ -9,22 +9,15 @@ // // Endian related functions. -#ifndef WEBP_UTILS_ENDIAN_INL_H_ -#define WEBP_UTILS_ENDIAN_INL_H_ +#ifndef WEBP_UTILS_ENDIAN_INL_UTILS_H_ +#define WEBP_UTILS_ENDIAN_INL_UTILS_H_ #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif -#include "../dsp/dsp.h" -#include "../webp/types.h" - -// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) -#if !defined(WORDS_BIGENDIAN) && \ - (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ - (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) -#define WORDS_BIGENDIAN -#endif +#include "src/dsp/dsp.h" +#include "src/webp/types.h" #if defined(WORDS_BIGENDIAN) #define HToLE32 BSwap32 @@ -97,4 +90,4 @@ static WEBP_INLINE uint64_t BSwap64(uint64_t x) { #endif // HAVE_BUILTIN_BSWAP64 } -#endif // WEBP_UTILS_ENDIAN_INL_H_ +#endif // WEBP_UTILS_ENDIAN_INL_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/filters_utils.c b/src/3rdparty/libwebp/src/utils/filters_utils.c index 49c1d18..bbc2c34 100644 --- a/src/3rdparty/libwebp/src/utils/filters_utils.c +++ b/src/3rdparty/libwebp/src/utils/filters_utils.c @@ -11,7 +11,7 @@ // // Author: Urvang (urvang@google.com) -#include "./filters_utils.h" +#include "src/utils/filters_utils.h" #include <stdlib.h> #include <string.h> diff --git a/src/3rdparty/libwebp/src/utils/filters_utils.h b/src/3rdparty/libwebp/src/utils/filters_utils.h index 088b132..61da66e 100644 --- a/src/3rdparty/libwebp/src/utils/filters_utils.h +++ b/src/3rdparty/libwebp/src/utils/filters_utils.h @@ -11,11 +11,11 @@ // // Author: Urvang (urvang@google.com) -#ifndef WEBP_UTILS_FILTERS_H_ -#define WEBP_UTILS_FILTERS_H_ +#ifndef WEBP_UTILS_FILTERS_UTILS_H_ +#define WEBP_UTILS_FILTERS_UTILS_H_ -#include "../webp/types.h" -#include "../dsp/dsp.h" +#include "src/webp/types.h" +#include "src/dsp/dsp.h" #ifdef __cplusplus extern "C" { @@ -29,4 +29,4 @@ WEBP_FILTER_TYPE WebPEstimateBestFilter(const uint8_t* data, } // extern "C" #endif -#endif /* WEBP_UTILS_FILTERS_H_ */ +#endif // WEBP_UTILS_FILTERS_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c index f950465..6f3b1bb 100644 --- a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c +++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c @@ -14,9 +14,9 @@ #include <assert.h> #include <stdlib.h> #include <string.h> -#include "./huffman_encode_utils.h" -#include "./utils.h" -#include "../webp/format_constants.h" +#include "src/utils/huffman_encode_utils.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" // ----------------------------------------------------------------------------- // Util function to optimize the symbol map for RLE coding diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h index a157165..3e6763c 100644 --- a/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h +++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h @@ -11,10 +11,10 @@ // // Entropy encoding (Huffman) for webp lossless -#ifndef WEBP_UTILS_HUFFMAN_ENCODE_H_ -#define WEBP_UTILS_HUFFMAN_ENCODE_H_ +#ifndef WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_ +#define WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_ -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -57,4 +57,4 @@ void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit, } #endif -#endif // WEBP_UTILS_HUFFMAN_ENCODE_H_ +#endif // WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/huffman_utils.c b/src/3rdparty/libwebp/src/utils/huffman_utils.c index 008b5d7..0cba0fb 100644 --- a/src/3rdparty/libwebp/src/utils/huffman_utils.c +++ b/src/3rdparty/libwebp/src/utils/huffman_utils.c @@ -14,9 +14,9 @@ #include <assert.h> #include <stdlib.h> #include <string.h> -#include "./huffman_utils.h" -#include "./utils.h" -#include "../webp/format_constants.h" +#include "src/utils/huffman_utils.h" +#include "src/utils/utils.h" +#include "src/webp/format_constants.h" // Huffman data read via DecodeImageStream is represented in two (red and green) // bytes. @@ -91,7 +91,8 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits, assert(code_lengths_size != 0); assert(code_lengths != NULL); - assert(root_table != NULL); + assert((root_table != NULL && sorted != NULL) || + (root_table == NULL && sorted == NULL)); assert(root_bits > 0); // Build histogram of code lengths. @@ -120,16 +121,22 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits, for (symbol = 0; symbol < code_lengths_size; ++symbol) { const int symbol_code_length = code_lengths[symbol]; if (code_lengths[symbol] > 0) { - sorted[offset[symbol_code_length]++] = symbol; + if (sorted != NULL) { + sorted[offset[symbol_code_length]++] = symbol; + } else { + offset[symbol_code_length]++; + } } } // Special case code with only one value. if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) { - HuffmanCode code; - code.bits = 0; - code.value = (uint16_t)sorted[0]; - ReplicateValue(table, 1, total_size, code); + if (sorted != NULL) { + HuffmanCode code; + code.bits = 0; + code.value = (uint16_t)sorted[0]; + ReplicateValue(table, 1, total_size, code); + } return total_size; } @@ -151,6 +158,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits, if (num_open < 0) { return 0; } + if (root_table == NULL) continue; for (; count[len] > 0; --count[len]) { HuffmanCode code; code.bits = (uint8_t)len; @@ -169,6 +177,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits, if (num_open < 0) { return 0; } + if (root_table == NULL) continue; for (; count[len] > 0; --count[len]) { HuffmanCode code; if ((key & mask) != low) { @@ -206,7 +215,10 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits, const int code_lengths[], int code_lengths_size) { int total_size; assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE); - if (code_lengths_size <= SORTED_SIZE_CUTOFF) { + if (root_table == NULL) { + total_size = BuildHuffmanTable(NULL, root_bits, + code_lengths, code_lengths_size, NULL); + } else if (code_lengths_size <= SORTED_SIZE_CUTOFF) { // use local stack-allocated array. uint16_t sorted[SORTED_SIZE_CUTOFF]; total_size = BuildHuffmanTable(root_table, root_bits, diff --git a/src/3rdparty/libwebp/src/utils/huffman_utils.h b/src/3rdparty/libwebp/src/utils/huffman_utils.h index c6dd6aa..13b7ad1 100644 --- a/src/3rdparty/libwebp/src/utils/huffman_utils.h +++ b/src/3rdparty/libwebp/src/utils/huffman_utils.h @@ -11,12 +11,12 @@ // // Author: Urvang Joshi (urvang@google.com) -#ifndef WEBP_UTILS_HUFFMAN_H_ -#define WEBP_UTILS_HUFFMAN_H_ +#ifndef WEBP_UTILS_HUFFMAN_UTILS_H_ +#define WEBP_UTILS_HUFFMAN_UTILS_H_ #include <assert.h> -#include "../webp/format_constants.h" -#include "../webp/types.h" +#include "src/webp/format_constants.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -78,6 +78,8 @@ void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups); // the huffman table. // Returns built table size or 0 in case of error (invalid tree or // memory error). +// If root_table is NULL, it returns 0 if a lookup cannot be built, something +// > 0 otherwise (but not the table size). int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits, const int code_lengths[], int code_lengths_size); @@ -85,4 +87,4 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits, } // extern "C" #endif -#endif // WEBP_UTILS_HUFFMAN_H_ +#endif // WEBP_UTILS_HUFFMAN_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c index d4d23d3..f65b6cd 100644 --- a/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c +++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c @@ -14,11 +14,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "./quant_levels_dec_utils.h" +#include "src/utils/quant_levels_dec_utils.h" #include <string.h> // for memset -#include "./utils.h" +#include "src/utils/utils.h" // #define USE_DITHERING // uncomment to enable ordered dithering (not vital) @@ -71,10 +71,11 @@ typedef struct { //------------------------------------------------------------------------------ -#define CLIP_MASK (int)(~0U << (8 + DFIX)) +#define CLIP_8b_MASK (int)(~0U << (8 + DFIX)) static WEBP_INLINE uint8_t clip_8b(int v) { - return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u; + return (!(v & CLIP_8b_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u; } +#undef CLIP_8b_MASK // vertical accumulation static void VFilter(SmoothParams* const p) { @@ -260,9 +261,15 @@ static void CleanupParams(SmoothParams* const p) { int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride, int strength) { - const int radius = 4 * strength / 100; + int radius = 4 * strength / 100; + if (strength < 0 || strength > 100) return 0; if (data == NULL || width <= 0 || height <= 0) return 0; // bad params + + // limit the filter size to not exceed the image dimensions + if (2 * radius + 1 > width) radius = (width - 1) >> 1; + if (2 * radius + 1 > height) radius = (height - 1) >> 1; + if (radius > 0) { SmoothParams p; memset(&p, 0, sizeof(p)); diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h index 59a1349..327f19f 100644 --- a/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h +++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h @@ -11,10 +11,10 @@ // // Author: Vikas Arora (vikasa@google.com) -#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_ -#define WEBP_UTILS_QUANT_LEVELS_DEC_H_ +#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ +#define WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -32,4 +32,4 @@ int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride, } // extern "C" #endif -#endif /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */ +#endif // WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_utils.c b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c index 73174e8..d65ad3c 100644 --- a/src/3rdparty/libwebp/src/utils/quant_levels_utils.c +++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c @@ -14,7 +14,7 @@ #include <assert.h> -#include "./quant_levels_utils.h" +#include "src/utils/quant_levels_utils.h" #define NUM_SYMBOLS 256 diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_utils.h b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h index 1cb5a32..9ee3ea0 100644 --- a/src/3rdparty/libwebp/src/utils/quant_levels_utils.h +++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h @@ -11,12 +11,12 @@ // // Author: Vikas Arora (vikasa@google.com) -#ifndef WEBP_UTILS_QUANT_LEVELS_H_ -#define WEBP_UTILS_QUANT_LEVELS_H_ +#ifndef WEBP_UTILS_QUANT_LEVELS_UTILS_H_ +#define WEBP_UTILS_QUANT_LEVELS_UTILS_H_ #include <stdlib.h> -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels, } // extern "C" #endif -#endif /* WEBP_UTILS_QUANT_LEVELS_H_ */ +#endif // WEBP_UTILS_QUANT_LEVELS_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/random_utils.c b/src/3rdparty/libwebp/src/utils/random_utils.c index 9f1e415..7edb3fe 100644 --- a/src/3rdparty/libwebp/src/utils/random_utils.c +++ b/src/3rdparty/libwebp/src/utils/random_utils.c @@ -12,7 +12,7 @@ // Author: Skal (pascal.massimino@gmail.com) #include <string.h> -#include "./random_utils.h" +#include "src/utils/random_utils.h" //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/utils/random_utils.h b/src/3rdparty/libwebp/src/utils/random_utils.h index c392a61..a5006f8 100644 --- a/src/3rdparty/libwebp/src/utils/random_utils.h +++ b/src/3rdparty/libwebp/src/utils/random_utils.h @@ -11,11 +11,11 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_UTILS_RANDOM_H_ -#define WEBP_UTILS_RANDOM_H_ +#ifndef WEBP_UTILS_RANDOM_UTILS_H_ +#define WEBP_UTILS_RANDOM_UTILS_H_ #include <assert.h> -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -60,4 +60,4 @@ static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) { } // extern "C" #endif -#endif /* WEBP_UTILS_RANDOM_H_ */ +#endif // WEBP_UTILS_RANDOM_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/rescaler_utils.c b/src/3rdparty/libwebp/src/utils/rescaler_utils.c index 0d1f80d..4bcae24 100644 --- a/src/3rdparty/libwebp/src/utils/rescaler_utils.c +++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.c @@ -14,8 +14,8 @@ #include <assert.h> #include <stdlib.h> #include <string.h> -#include "../dsp/dsp.h" -#include "./rescaler_utils.h" +#include "src/dsp/dsp.h" +#include "src/utils/rescaler_utils.h" //------------------------------------------------------------------------------ @@ -84,12 +84,14 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height, int height = *scaled_height; // if width is unspecified, scale original proportionally to height ratio. - if (width == 0) { - width = (src_width * height + src_height / 2) / src_height; + if (width == 0 && src_height > 0) { + width = + (int)(((uint64_t)src_width * height + src_height - 1) / src_height); } // if height is unspecified, scale original proportionally to width ratio. - if (height == 0) { - height = (src_height * width + src_width / 2) / src_width; + if (height == 0 && src_width > 0) { + height = + (int)(((uint64_t)src_height * width + src_width - 1) / src_width); } // Check if the overall dimensions still make sense. if (width <= 0 || height <= 0) { diff --git a/src/3rdparty/libwebp/src/utils/rescaler_utils.h b/src/3rdparty/libwebp/src/utils/rescaler_utils.h index 98b01a7..ca41e42 100644 --- a/src/3rdparty/libwebp/src/utils/rescaler_utils.h +++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.h @@ -11,14 +11,14 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_UTILS_RESCALER_H_ -#define WEBP_UTILS_RESCALER_H_ +#ifndef WEBP_UTILS_RESCALER_UTILS_H_ +#define WEBP_UTILS_RESCALER_UTILS_H_ #ifdef __cplusplus extern "C" { #endif -#include "../webp/types.h" +#include "src/webp/types.h" #define WEBP_RESCALER_RFIX 32 // fixed-point precision for multiplies #define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX) @@ -98,4 +98,4 @@ int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) { } // extern "C" #endif -#endif /* WEBP_UTILS_RESCALER_H_ */ +#endif // WEBP_UTILS_RESCALER_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/thread_utils.c b/src/3rdparty/libwebp/src/utils/thread_utils.c index 1729060..438296b 100644 --- a/src/3rdparty/libwebp/src/utils/thread_utils.c +++ b/src/3rdparty/libwebp/src/utils/thread_utils.c @@ -13,8 +13,8 @@ #include <assert.h> #include <string.h> // for memset() -#include "./thread_utils.h" -#include "./utils.h" +#include "src/utils/thread_utils.h" +#include "src/utils/utils.h" #ifdef WEBP_USE_THREAD @@ -50,11 +50,11 @@ typedef struct { #endif // _WIN32 -struct WebPWorkerImpl { +typedef struct { pthread_mutex_t mutex_; pthread_cond_t condition_; pthread_t thread_; -}; +} WebPWorkerImpl; #if defined(_WIN32) @@ -201,25 +201,28 @@ static int pthread_cond_wait(pthread_cond_t* const condition, //------------------------------------------------------------------------------ -static void Execute(WebPWorker* const worker); // Forward declaration. - static THREADFN ThreadLoop(void* ptr) { WebPWorker* const worker = (WebPWorker*)ptr; + WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_; int done = 0; while (!done) { - pthread_mutex_lock(&worker->impl_->mutex_); + pthread_mutex_lock(&impl->mutex_); while (worker->status_ == OK) { // wait in idling mode - pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + pthread_cond_wait(&impl->condition_, &impl->mutex_); } if (worker->status_ == WORK) { - Execute(worker); + WebPGetWorkerInterface()->Execute(worker); worker->status_ = OK; } else if (worker->status_ == NOT_OK) { // finish the worker done = 1; } // signal to the main thread that we're done (for Sync()) - pthread_cond_signal(&worker->impl_->condition_); - pthread_mutex_unlock(&worker->impl_->mutex_); + // Note the associated mutex does not need to be held when signaling the + // condition. Unlocking the mutex first may improve performance in some + // implementations, avoiding the case where the waiting thread can't + // reacquire the mutex when woken. + pthread_mutex_unlock(&impl->mutex_); + pthread_cond_signal(&impl->condition_); } return THREAD_RETURN(NULL); // Thread is finished } @@ -229,21 +232,28 @@ static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) { // No-op when attempting to change state on a thread that didn't come up. // Checking status_ without acquiring the lock first would result in a data // race. - if (worker->impl_ == NULL) return; + WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_; + if (impl == NULL) return; - pthread_mutex_lock(&worker->impl_->mutex_); + pthread_mutex_lock(&impl->mutex_); if (worker->status_ >= OK) { // wait for the worker to finish while (worker->status_ != OK) { - pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + pthread_cond_wait(&impl->condition_, &impl->mutex_); } // assign new status and release the working thread if needed if (new_status != OK) { worker->status_ = new_status; - pthread_cond_signal(&worker->impl_->condition_); + // Note the associated mutex does not need to be held when signaling the + // condition. Unlocking the mutex first may improve performance in some + // implementations, avoiding the case where the waiting thread can't + // reacquire the mutex when woken. + pthread_mutex_unlock(&impl->mutex_); + pthread_cond_signal(&impl->condition_); + return; } } - pthread_mutex_unlock(&worker->impl_->mutex_); + pthread_mutex_unlock(&impl->mutex_); } #endif // WEBP_USE_THREAD @@ -268,26 +278,28 @@ static int Reset(WebPWorker* const worker) { worker->had_error = 0; if (worker->status_ < OK) { #ifdef WEBP_USE_THREAD - worker->impl_ = (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(*worker->impl_)); + WebPWorkerImpl* const impl = + (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(WebPWorkerImpl)); + worker->impl_ = (void*)impl; if (worker->impl_ == NULL) { return 0; } - if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + if (pthread_mutex_init(&impl->mutex_, NULL)) { goto Error; } - if (pthread_cond_init(&worker->impl_->condition_, NULL)) { - pthread_mutex_destroy(&worker->impl_->mutex_); + if (pthread_cond_init(&impl->condition_, NULL)) { + pthread_mutex_destroy(&impl->mutex_); goto Error; } - pthread_mutex_lock(&worker->impl_->mutex_); - ok = !pthread_create(&worker->impl_->thread_, NULL, ThreadLoop, worker); + pthread_mutex_lock(&impl->mutex_); + ok = !pthread_create(&impl->thread_, NULL, ThreadLoop, worker); if (ok) worker->status_ = OK; - pthread_mutex_unlock(&worker->impl_->mutex_); + pthread_mutex_unlock(&impl->mutex_); if (!ok) { - pthread_mutex_destroy(&worker->impl_->mutex_); - pthread_cond_destroy(&worker->impl_->condition_); + pthread_mutex_destroy(&impl->mutex_); + pthread_cond_destroy(&impl->condition_); Error: - WebPSafeFree(worker->impl_); + WebPSafeFree(impl); worker->impl_ = NULL; return 0; } @@ -318,11 +330,12 @@ static void Launch(WebPWorker* const worker) { static void End(WebPWorker* const worker) { #ifdef WEBP_USE_THREAD if (worker->impl_ != NULL) { + WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_; ChangeState(worker, NOT_OK); - pthread_join(worker->impl_->thread_, NULL); - pthread_mutex_destroy(&worker->impl_->mutex_); - pthread_cond_destroy(&worker->impl_->condition_); - WebPSafeFree(worker->impl_); + pthread_join(impl->thread_, NULL); + pthread_mutex_destroy(&impl->mutex_); + pthread_cond_destroy(&impl->condition_); + WebPSafeFree(impl); worker->impl_ = NULL; } #else diff --git a/src/3rdparty/libwebp/src/utils/thread_utils.h b/src/3rdparty/libwebp/src/utils/thread_utils.h index 8408311..29ad49f 100644 --- a/src/3rdparty/libwebp/src/utils/thread_utils.h +++ b/src/3rdparty/libwebp/src/utils/thread_utils.h @@ -11,14 +11,14 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_UTILS_THREAD_H_ -#define WEBP_UTILS_THREAD_H_ +#ifndef WEBP_UTILS_THREAD_UTILS_H_ +#define WEBP_UTILS_THREAD_UTILS_H_ #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif -#include "../webp/types.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -35,12 +35,9 @@ typedef enum { // arguments (data1 and data2), and should return false in case of error. typedef int (*WebPWorkerHook)(void*, void*); -// Platform-dependent implementation details for the worker. -typedef struct WebPWorkerImpl WebPWorkerImpl; - // Synchronization object used to launch job in the worker thread typedef struct { - WebPWorkerImpl* impl_; + void* impl_; // platform-dependent implementation worker details WebPWorkerStatus status_; WebPWorkerHook hook; // hook to call void* data1; // first argument passed to 'hook' @@ -78,11 +75,11 @@ typedef struct { // decoding takes place. The contents of the interface struct are copied, it // is safe to free the corresponding memory after this call. This function is // not thread-safe. Return false in case of invalid pointer or methods. -WEBP_EXTERN(int) WebPSetWorkerInterface( +WEBP_EXTERN int WebPSetWorkerInterface( const WebPWorkerInterface* const winterface); // Retrieve the currently set thread worker interface. -WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void); +WEBP_EXTERN const WebPWorkerInterface* WebPGetWorkerInterface(void); //------------------------------------------------------------------------------ @@ -90,4 +87,4 @@ WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void); } // extern "C" #endif -#endif /* WEBP_UTILS_THREAD_H_ */ +#endif // WEBP_UTILS_THREAD_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/utils/utils.c b/src/3rdparty/libwebp/src/utils/utils.c index 504d924..44d5c14 100644 --- a/src/3rdparty/libwebp/src/utils/utils.c +++ b/src/3rdparty/libwebp/src/utils/utils.c @@ -13,10 +13,11 @@ #include <stdlib.h> #include <string.h> // for memcpy() -#include "../webp/decode.h" -#include "../webp/encode.h" -#include "../webp/format_constants.h" // for MAX_PALETTE_SIZE -#include "./utils.h" +#include "src/webp/decode.h" +#include "src/webp/encode.h" +#include "src/webp/format_constants.h" // for MAX_PALETTE_SIZE +#include "src/utils/color_cache_utils.h" +#include "src/utils/utils.h" // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow, @@ -252,7 +253,6 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) { int num_colors = 0; uint8_t in_use[COLOR_HASH_SIZE] = { 0 }; uint32_t colors[COLOR_HASH_SIZE]; - static const uint64_t kHashMul = 0x1e35a7bdull; const uint32_t* argb = pic->argb; const int width = pic->width; const int height = pic->height; @@ -267,7 +267,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) { continue; } last_pix = argb[x]; - key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT; + key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT); while (1) { if (!in_use[key]) { colors[key] = last_pix; diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h index 3ab4590..2a3ec92 100644 --- a/src/3rdparty/libwebp/src/utils/utils.h +++ b/src/3rdparty/libwebp/src/utils/utils.h @@ -16,14 +16,14 @@ #define WEBP_UTILS_UTILS_H_ #ifdef HAVE_CONFIG_H -#include "../webp/config.h" +#include "src/webp/config.h" #endif #include <assert.h> #include <limits.h> -#include "../dsp/dsp.h" -#include "../webp/types.h" +#include "src/dsp/dsp.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -48,13 +48,13 @@ extern "C" { // somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this // safe malloc() borrows the signature from calloc(), pointing at the dangerous // underlying multiply involved. -WEBP_EXTERN(void*) WebPSafeMalloc(uint64_t nmemb, size_t size); +WEBP_EXTERN void* WebPSafeMalloc(uint64_t nmemb, size_t size); // Note that WebPSafeCalloc() expects the second argument type to be 'size_t' // in order to favor the "calloc(num_foo, sizeof(foo))" pattern. -WEBP_EXTERN(void*) WebPSafeCalloc(uint64_t nmemb, size_t size); +WEBP_EXTERN void* WebPSafeCalloc(uint64_t nmemb, size_t size); // Companion deallocation function to the above allocations. -WEBP_EXTERN(void) WebPSafeFree(void* const ptr); +WEBP_EXTERN void WebPSafeFree(void* const ptr); //------------------------------------------------------------------------------ // Alignment @@ -66,7 +66,7 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr); // memcpy() is the safe way of moving potentially unaligned 32b memory. static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) { uint32_t A; - memcpy(&A, (const int*)ptr, sizeof(A)); + memcpy(&A, ptr, sizeof(A)); return A; } static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) { @@ -92,14 +92,14 @@ static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) { // Store 16, 24 or 32 bits in little-endian order. static WEBP_INLINE void PutLE16(uint8_t* const data, int val) { assert(val < (1 << 16)); - data[0] = (val >> 0); - data[1] = (val >> 8); + data[0] = (val >> 0) & 0xff; + data[1] = (val >> 8) & 0xff; } static WEBP_INLINE void PutLE24(uint8_t* const data, int val) { assert(val < (1 << 24)); PutLE16(data, val & 0xffff); - data[2] = (val >> 16); + data[2] = (val >> 16) & 0xff; } static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) { @@ -107,19 +107,6 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) { PutLE16(data + 2, (int)(val >> 16)); } -// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either -// based on table or not. Can be used as fallback if clz() is not available. -#define WEBP_NEED_LOG_TABLE_8BIT -extern const uint8_t WebPLogTable8bit[256]; -static WEBP_INLINE int WebPLog2FloorC(uint32_t n) { - int log = 0; - while (n >= 256) { - log += 8; - n >>= 8; - } - return log + WebPLogTable8bit[n]; -} - // Returns (int)floor(log2(n)). n must be > 0. // use GNU builtins where available. #if defined(__GNUC__) && \ @@ -138,6 +125,19 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return first_set_bit; } #else // default: use the C-version. +// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either +// based on table or not. Can be used as fallback if clz() is not available. +#define WEBP_NEED_LOG_TABLE_8BIT +extern const uint8_t WebPLogTable8bit[256]; +static WEBP_INLINE int WebPLog2FloorC(uint32_t n) { + int log_value = 0; + while (n >= 256) { + log_value += 8; + n >>= 8; + } + return log_value + WebPLogTable8bit[n]; +} + static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); } #endif @@ -147,14 +147,14 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); } struct WebPPicture; // Copy width x height pixels from 'src' to 'dst' honoring the strides. -WEBP_EXTERN(void) WebPCopyPlane(const uint8_t* src, int src_stride, - uint8_t* dst, int dst_stride, - int width, int height); +WEBP_EXTERN void WebPCopyPlane(const uint8_t* src, int src_stride, + uint8_t* dst, int dst_stride, + int width, int height); // Copy ARGB pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are // assumed to be already allocated and using ARGB data. -WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src, - struct WebPPicture* const dst); +WEBP_EXTERN void WebPCopyPixels(const struct WebPPicture* const src, + struct WebPPicture* const dst); //------------------------------------------------------------------------------ // Unique colors. @@ -166,8 +166,8 @@ WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src, // MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'. // Note: 'palette' is assumed to be an array already allocated with at least // MAX_PALETTE_SIZE elements. -WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic, - uint32_t* const palette); +WEBP_EXTERN int WebPGetColorPalette(const struct WebPPicture* const pic, + uint32_t* const palette); //------------------------------------------------------------------------------ @@ -175,4 +175,4 @@ WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic, } // extern "C" #endif -#endif /* WEBP_UTILS_UTILS_H_ */ +#endif // WEBP_UTILS_UTILS_H_ diff --git a/src/3rdparty/libwebp/src/webp/config.h b/src/3rdparty/libwebp/src/webp/config.h index 731115b..80bb1f0 100644 --- a/src/3rdparty/libwebp/src/webp/config.h +++ b/src/3rdparty/libwebp/src/webp/config.h @@ -14,6 +14,9 @@ /* Set to 1 if __builtin_bswap64 is available */ /* #undef HAVE_BUILTIN_BSWAP64 */ +/* Define to 1 if you have the <cpu-features.h> header file. */ +/* #undef HAVE_CPU_FEATURES_H */ + /* Define to 1 if you have the <dlfcn.h> header file. */ /* #undef HAVE_DLFCN_H */ @@ -65,8 +68,7 @@ /* Define to 1 if you have the <windows.h> header file. */ /* #undef HAVE_WINDOWS_H */ -/* Define to the sub-directory in which libtool stores uninstalled libraries. - */ +/* Define to the sub-directory where libtool stores uninstalled libraries. */ /* #undef LT_OBJDIR ".libs/" */ /* Name of package */ @@ -79,7 +81,7 @@ #define PACKAGE_NAME "libwebp" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "libwebp 0.6.0" +#define PACKAGE_STRING "libwebp 1.0.3" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "libwebp" @@ -88,7 +90,7 @@ #define PACKAGE_URL "http://developers.google.com/speed/webp" /* Define to the version of this package. */ -#define PACKAGE_VERSION "0.6.0" +#define PACKAGE_VERSION "1.0.3" /* Define to necessary symbol if this constant uses a non-standard name on your system. */ @@ -98,7 +100,7 @@ /* #undef STDC_HEADERS */ /* Version number of package */ -#define VERSION "0.6.0" +#define VERSION "1.0.3" /* Enable experimental code */ /* #undef WEBP_EXPERIMENTAL_FEATURES */ @@ -127,6 +129,9 @@ /* Set to 1 if PNG library is installed */ /* #undef WEBP_HAVE_PNG */ +/* Set to 1 if SDL library is installed */ +/* #undef WEBP_HAVE_SDL */ + /* Set to 1 if SSE2 is supported */ /* #undef WEBP_HAVE_SSE2 */ @@ -136,6 +141,9 @@ /* Set to 1 if TIFF library is installed */ /* #undef WEBP_HAVE_TIFF */ +/* Enable near lossless encoding */ +/* #undef WEBP_NEAR_LOSSLESS */ + /* Undefine this to disable thread support. */ #define WEBP_USE_THREAD 1 diff --git a/src/3rdparty/libwebp/src/webp/decode.h b/src/3rdparty/libwebp/src/webp/decode.h index 4c5e74a..ae8bfe8 100644 --- a/src/3rdparty/libwebp/src/webp/decode.h +++ b/src/3rdparty/libwebp/src/webp/decode.h @@ -36,39 +36,45 @@ typedef struct WebPDecoderConfig WebPDecoderConfig; // Return the decoder's version number, packed in hexadecimal using 8bits for // each of major/minor/revision. E.g: v2.5.7 is 0x020507. -WEBP_EXTERN(int) WebPGetDecoderVersion(void); +WEBP_EXTERN int WebPGetDecoderVersion(void); // Retrieve basic header information: width, height. // This function will also validate the header, returning true on success, // false otherwise. '*width' and '*height' are only valid on successful return. // Pointers 'width' and 'height' can be passed NULL if deemed irrelevant. -WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, size_t data_size, - int* width, int* height); +// Note: The following chunk sequences (before the raw VP8/VP8L data) are +// considered valid by this function: +// RIFF + VP8(L) +// RIFF + VP8X + (optional chunks) + VP8(L) +// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose. +// VP8(L) <-- Not a valid WebP format: only allowed for internal purpose. +WEBP_EXTERN int WebPGetInfo(const uint8_t* data, size_t data_size, + int* width, int* height); // Decodes WebP images pointed to by 'data' and returns RGBA samples, along // with the dimensions in *width and *height. The ordering of samples in // memory is R, G, B, A, R, G, B, A... in scan order (endian-independent). // The returned pointer should be deleted calling WebPFree(). // Returns NULL in case of error. -WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, size_t data_size, - int* width, int* height); +WEBP_EXTERN uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size, + int* width, int* height); // Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data. -WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, size_t data_size, - int* width, int* height); +WEBP_EXTERN uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size, + int* width, int* height); // Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data. -WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, size_t data_size, - int* width, int* height); +WEBP_EXTERN uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size, + int* width, int* height); // Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data. // If the bitstream contains transparency, it is ignored. -WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, size_t data_size, - int* width, int* height); +WEBP_EXTERN uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size, + int* width, int* height); // Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data. -WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size, - int* width, int* height); +WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size, + int* width, int* height); // Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer @@ -80,13 +86,13 @@ WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size, // have a common stride returned as '*uv_stride'. // Return NULL in case of error. // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr -WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, size_t data_size, - int* width, int* height, - uint8_t** u, uint8_t** v, - int* stride, int* uv_stride); +WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size, + int* width, int* height, + uint8_t** u, uint8_t** v, + int* stride, int* uv_stride); // Releases memory returned by the WebPDecode*() functions above. -WEBP_EXTERN(void) WebPFree(void* ptr); +WEBP_EXTERN void WebPFree(void* ptr); // These five functions are variants of the above ones, that decode the image // directly into a pre-allocated buffer 'output_buffer'. The maximum storage @@ -96,22 +102,22 @@ WEBP_EXTERN(void) WebPFree(void* ptr); // The parameter 'output_stride' specifies the distance (in bytes) // between scanlines. Hence, output_buffer_size is expected to be at least // output_stride x picture-height. -WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto( +WEBP_EXTERN uint8_t* WebPDecodeRGBAInto( const uint8_t* data, size_t data_size, uint8_t* output_buffer, size_t output_buffer_size, int output_stride); -WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto( +WEBP_EXTERN uint8_t* WebPDecodeARGBInto( const uint8_t* data, size_t data_size, uint8_t* output_buffer, size_t output_buffer_size, int output_stride); -WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto( +WEBP_EXTERN uint8_t* WebPDecodeBGRAInto( const uint8_t* data, size_t data_size, uint8_t* output_buffer, size_t output_buffer_size, int output_stride); // RGB and BGR variants. Here too the transparency information, if present, // will be dropped and ignored. -WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto( +WEBP_EXTERN uint8_t* WebPDecodeRGBInto( const uint8_t* data, size_t data_size, uint8_t* output_buffer, size_t output_buffer_size, int output_stride); -WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto( +WEBP_EXTERN uint8_t* WebPDecodeBGRInto( const uint8_t* data, size_t data_size, uint8_t* output_buffer, size_t output_buffer_size, int output_stride); @@ -122,7 +128,7 @@ WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto( // 'u_size' and 'v_size' respectively. // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred // during decoding (or because some buffers were found to be too small). -WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto( +WEBP_EXTERN uint8_t* WebPDecodeYUVInto( const uint8_t* data, size_t data_size, uint8_t* luma, size_t luma_size, int luma_stride, uint8_t* u, size_t u_size, int u_stride, @@ -213,7 +219,7 @@ struct WebPDecBuffer { }; // Internal, version-checked, entry point -WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int); +WEBP_EXTERN int WebPInitDecBufferInternal(WebPDecBuffer*, int); // Initialize the structure as empty. Must be called before any other use. // Returns false in case of version mismatch @@ -223,7 +229,7 @@ static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) { // Free any memory associated with the buffer. Must always be called last. // Note: doesn't free the 'buffer' structure itself. -WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer); +WEBP_EXTERN void WebPFreeDecBuffer(WebPDecBuffer* buffer); //------------------------------------------------------------------------------ // Enumeration of the status codes @@ -277,7 +283,7 @@ typedef enum VP8StatusCode { // within valid bounds. // All other fields of WebPDecBuffer MUST remain constant between calls. // Returns NULL if the allocation failed. -WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer); +WEBP_EXTERN WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer); // This function allocates and initializes an incremental-decoder object, which // will output the RGB/A samples specified by 'csp' into a preallocated @@ -289,7 +295,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer); // colorspace 'csp' is taken into account for allocating this buffer. All other // parameters are ignored. // Returns NULL if the allocation failed, or if some parameters are invalid. -WEBP_EXTERN(WebPIDecoder*) WebPINewRGB( +WEBP_EXTERN WebPIDecoder* WebPINewRGB( WEBP_CSP_MODE csp, uint8_t* output_buffer, size_t output_buffer_size, int output_stride); @@ -304,7 +310,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewRGB( // In this case, the output buffer will be automatically allocated (using // MODE_YUVA) when decoding starts. All parameters are then ignored. // Returns NULL if the allocation failed or if a parameter is invalid. -WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA( +WEBP_EXTERN WebPIDecoder* WebPINewYUVA( uint8_t* luma, size_t luma_size, int luma_stride, uint8_t* u, size_t u_size, int u_stride, uint8_t* v, size_t v_size, int v_stride, @@ -312,19 +318,19 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA( // Deprecated version of the above, without the alpha plane. // Kept for backward compatibility. -WEBP_EXTERN(WebPIDecoder*) WebPINewYUV( +WEBP_EXTERN WebPIDecoder* WebPINewYUV( uint8_t* luma, size_t luma_size, int luma_stride, uint8_t* u, size_t u_size, int u_stride, uint8_t* v, size_t v_size, int v_stride); // Deletes the WebPIDecoder object and associated memory. Must always be called // if WebPINewDecoder, WebPINewRGB or WebPINewYUV succeeded. -WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* idec); +WEBP_EXTERN void WebPIDelete(WebPIDecoder* idec); // Copies and decodes the next available data. Returns VP8_STATUS_OK when // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more // data is expected. Returns error in other cases. -WEBP_EXTERN(VP8StatusCode) WebPIAppend( +WEBP_EXTERN VP8StatusCode WebPIAppend( WebPIDecoder* idec, const uint8_t* data, size_t data_size); // A variant of the above function to be used when data buffer contains @@ -332,7 +338,7 @@ WEBP_EXTERN(VP8StatusCode) WebPIAppend( // to the internal memory. // Note that the value of the 'data' pointer can change between calls to // WebPIUpdate, for instance when the data buffer is resized to fit larger data. -WEBP_EXTERN(VP8StatusCode) WebPIUpdate( +WEBP_EXTERN VP8StatusCode WebPIUpdate( WebPIDecoder* idec, const uint8_t* data, size_t data_size); // Returns the RGB/A image decoded so far. Returns NULL if output params @@ -340,15 +346,16 @@ WEBP_EXTERN(VP8StatusCode) WebPIUpdate( // specified during call to WebPINewDecoder() or WebPINewRGB(). // *last_y is the index of last decoded row in raster scan order. Some pointers // (*last_y, *width etc.) can be NULL if corresponding information is not -// needed. -WEBP_EXTERN(uint8_t*) WebPIDecGetRGB( +// needed. The values in these pointers are only valid on successful (non-NULL) +// return. +WEBP_EXTERN uint8_t* WebPIDecGetRGB( const WebPIDecoder* idec, int* last_y, int* width, int* height, int* stride); // Same as above function to get a YUVA image. Returns pointer to the luma // plane or NULL in case of error. If there is no alpha information // the alpha pointer '*a' will be returned NULL. -WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA( +WEBP_EXTERN uint8_t* WebPIDecGetYUVA( const WebPIDecoder* idec, int* last_y, uint8_t** u, uint8_t** v, uint8_t** a, int* width, int* height, int* stride, int* uv_stride, int* a_stride); @@ -368,7 +375,7 @@ static WEBP_INLINE uint8_t* WebPIDecGetYUV( // Returns NULL in case the incremental decoder object is in an invalid state. // Otherwise returns the pointer to the internal representation. This structure // is read-only, tied to WebPIDecoder's lifespan and should not be modified. -WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea( +WEBP_EXTERN const WebPDecBuffer* WebPIDecodedArea( const WebPIDecoder* idec, int* left, int* top, int* width, int* height); //------------------------------------------------------------------------------ @@ -416,7 +423,7 @@ struct WebPBitstreamFeatures { }; // Internal, version-checked, entry point -WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal( +WEBP_EXTERN VP8StatusCode WebPGetFeaturesInternal( const uint8_t*, size_t, WebPBitstreamFeatures*, int); // Retrieve features from the bitstream. The *features structure is filled @@ -424,6 +431,12 @@ WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal( // Returns VP8_STATUS_OK when the features are successfully retrieved. Returns // VP8_STATUS_NOT_ENOUGH_DATA when more data is needed to retrieve the // features from headers. Returns error in other cases. +// Note: The following chunk sequences (before the raw VP8/VP8L data) are +// considered valid by this function: +// RIFF + VP8(L) +// RIFF + VP8X + (optional chunks) + VP8(L) +// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose. +// VP8(L) <-- Not a valid WebP format: only allowed for internal purpose. static WEBP_INLINE VP8StatusCode WebPGetFeatures( const uint8_t* data, size_t data_size, WebPBitstreamFeatures* features) { @@ -457,7 +470,7 @@ struct WebPDecoderConfig { }; // Internal, version-checked, entry point -WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int); +WEBP_EXTERN int WebPInitDecoderConfigInternal(WebPDecoderConfig*, int); // Initialize the configuration as empty. This function must always be // called first, unless WebPGetFeatures() is to be called. @@ -477,17 +490,17 @@ static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) { // The return WebPIDecoder object must always be deleted calling WebPIDelete(). // Returns NULL in case of error (and config->status will then reflect // the error condition, if available). -WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size, - WebPDecoderConfig* config); +WEBP_EXTERN WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size, + WebPDecoderConfig* config); // Non-incremental version. This version decodes the full data at once, taking // 'config' into account. Returns decoding status (which should be VP8_STATUS_OK // if the decoding was successful). Note that 'config' cannot be NULL. -WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size, - WebPDecoderConfig* config); +WEBP_EXTERN VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size, + WebPDecoderConfig* config); #ifdef __cplusplus } // extern "C" #endif -#endif /* WEBP_WEBP_DECODE_H_ */ +#endif // WEBP_WEBP_DECODE_H_ diff --git a/src/3rdparty/libwebp/src/webp/demux.h b/src/3rdparty/libwebp/src/webp/demux.h index 454f691..846eeb1 100644 --- a/src/3rdparty/libwebp/src/webp/demux.h +++ b/src/3rdparty/libwebp/src/webp/demux.h @@ -71,7 +71,7 @@ typedef struct WebPAnimDecoderOptions WebPAnimDecoderOptions; // Returns the version number of the demux library, packed in hexadecimal using // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507. -WEBP_EXTERN(int) WebPGetDemuxVersion(void); +WEBP_EXTERN int WebPGetDemuxVersion(void); //------------------------------------------------------------------------------ // Life of a Demux object @@ -85,7 +85,7 @@ typedef enum WebPDemuxState { } WebPDemuxState; // Internal, version-checked, entry point -WEBP_EXTERN(WebPDemuxer*) WebPDemuxInternal( +WEBP_EXTERN WebPDemuxer* WebPDemuxInternal( const WebPData*, int, WebPDemuxState*, int); // Parses the full WebP file given by 'data'. For single images the WebP file @@ -109,27 +109,32 @@ static WEBP_INLINE WebPDemuxer* WebPDemuxPartial( } // Frees memory associated with 'dmux'. -WEBP_EXTERN(void) WebPDemuxDelete(WebPDemuxer* dmux); +WEBP_EXTERN void WebPDemuxDelete(WebPDemuxer* dmux); //------------------------------------------------------------------------------ // Data/information extraction. typedef enum WebPFormatFeature { - WEBP_FF_FORMAT_FLAGS, // Extended format flags present in the 'VP8X' chunk. + WEBP_FF_FORMAT_FLAGS, // bit-wise combination of WebPFeatureFlags + // corresponding to the 'VP8X' chunk (if present). WEBP_FF_CANVAS_WIDTH, WEBP_FF_CANVAS_HEIGHT, - WEBP_FF_LOOP_COUNT, - WEBP_FF_BACKGROUND_COLOR, - WEBP_FF_FRAME_COUNT // Number of frames present in the demux object. - // In case of a partial demux, this is the number of - // frames seen so far, with the last frame possibly - // being partial. + WEBP_FF_LOOP_COUNT, // only relevant for animated file + WEBP_FF_BACKGROUND_COLOR, // idem. + WEBP_FF_FRAME_COUNT // Number of frames present in the demux object. + // In case of a partial demux, this is the number + // of frames seen so far, with the last frame + // possibly being partial. } WebPFormatFeature; // Get the 'feature' value from the 'dmux'. // NOTE: values are only valid if WebPDemux() was used or WebPDemuxPartial() // returned a state > WEBP_DEMUX_PARSING_HEADER. -WEBP_EXTERN(uint32_t) WebPDemuxGetI( +// If 'feature' is WEBP_FF_FORMAT_FLAGS, the returned value is a bit-wise +// combination of WebPFeatureFlags values. +// If 'feature' is WEBP_FF_LOOP_COUNT, WEBP_FF_BACKGROUND_COLOR, the returned +// value is only meaningful if the bitstream is animated. +WEBP_EXTERN uint32_t WebPDemuxGetI( const WebPDemuxer* dmux, WebPFormatFeature feature); //------------------------------------------------------------------------------ @@ -159,20 +164,20 @@ struct WebPIterator { // Returns false if 'dmux' is NULL or frame 'frame_number' is not present. // Call WebPDemuxReleaseIterator() when use of the iterator is complete. // NOTE: 'dmux' must persist for the lifetime of 'iter'. -WEBP_EXTERN(int) WebPDemuxGetFrame( +WEBP_EXTERN int WebPDemuxGetFrame( const WebPDemuxer* dmux, int frame_number, WebPIterator* iter); // Sets 'iter->fragment' to point to the next ('iter->frame_num' + 1) or // previous ('iter->frame_num' - 1) frame. These functions do not loop. // Returns true on success, false otherwise. -WEBP_EXTERN(int) WebPDemuxNextFrame(WebPIterator* iter); -WEBP_EXTERN(int) WebPDemuxPrevFrame(WebPIterator* iter); +WEBP_EXTERN int WebPDemuxNextFrame(WebPIterator* iter); +WEBP_EXTERN int WebPDemuxPrevFrame(WebPIterator* iter); // Releases any memory associated with 'iter'. // Must be called before any subsequent calls to WebPDemuxGetChunk() on the same // iter. Also, must be called before destroying the associated WebPDemuxer with // WebPDemuxDelete(). -WEBP_EXTERN(void) WebPDemuxReleaseIterator(WebPIterator* iter); +WEBP_EXTERN void WebPDemuxReleaseIterator(WebPIterator* iter); //------------------------------------------------------------------------------ // Chunk iteration. @@ -197,20 +202,20 @@ struct WebPChunkIterator { // payloads are accessed through WebPDemuxGetFrame() and related functions. // Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete. // NOTE: 'dmux' must persist for the lifetime of the iterator. -WEBP_EXTERN(int) WebPDemuxGetChunk(const WebPDemuxer* dmux, - const char fourcc[4], int chunk_number, - WebPChunkIterator* iter); +WEBP_EXTERN int WebPDemuxGetChunk(const WebPDemuxer* dmux, + const char fourcc[4], int chunk_number, + WebPChunkIterator* iter); // Sets 'iter->chunk' to point to the next ('iter->chunk_num' + 1) or previous // ('iter->chunk_num' - 1) chunk. These functions do not loop. // Returns true on success, false otherwise. -WEBP_EXTERN(int) WebPDemuxNextChunk(WebPChunkIterator* iter); -WEBP_EXTERN(int) WebPDemuxPrevChunk(WebPChunkIterator* iter); +WEBP_EXTERN int WebPDemuxNextChunk(WebPChunkIterator* iter); +WEBP_EXTERN int WebPDemuxPrevChunk(WebPChunkIterator* iter); // Releases any memory associated with 'iter'. // Must be called before destroying the associated WebPDemuxer with // WebPDemuxDelete(). -WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter); +WEBP_EXTERN void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter); //------------------------------------------------------------------------------ // WebPAnimDecoder API @@ -252,7 +257,7 @@ struct WebPAnimDecoderOptions { }; // Internal, version-checked, entry point. -WEBP_EXTERN(int) WebPAnimDecoderOptionsInitInternal( +WEBP_EXTERN int WebPAnimDecoderOptionsInitInternal( WebPAnimDecoderOptions*, int); // Should always be called, to initialize a fresh WebPAnimDecoderOptions @@ -266,7 +271,7 @@ static WEBP_INLINE int WebPAnimDecoderOptionsInit( } // Internal, version-checked, entry point. -WEBP_EXTERN(WebPAnimDecoder*) WebPAnimDecoderNewInternal( +WEBP_EXTERN WebPAnimDecoder* WebPAnimDecoderNewInternal( const WebPData*, const WebPAnimDecoderOptions*, int); // Creates and initializes a WebPAnimDecoder object. @@ -301,8 +306,8 @@ struct WebPAnimInfo { // info - (out) global information fetched from the animation. // Returns: // True on success. -WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, - WebPAnimInfo* info); +WEBP_EXTERN int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, + WebPAnimInfo* info); // Fetch the next frame from 'dec' based on options supplied to // WebPAnimDecoderNew(). This will be a fully reconstructed canvas of size @@ -316,8 +321,8 @@ WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, // Returns: // False if any of the arguments are NULL, or if there is a parsing or // decoding error, or if there are no more frames. Otherwise, returns true. -WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec, - uint8_t** buf, int* timestamp); +WEBP_EXTERN int WebPAnimDecoderGetNext(WebPAnimDecoder* dec, + uint8_t** buf, int* timestamp); // Check if there are more frames left to decode. // Parameters: @@ -325,7 +330,7 @@ WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec, // Returns: // True if 'dec' is not NULL and some frames are yet to be decoded. // Otherwise, returns false. -WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec); +WEBP_EXTERN int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec); // Resets the WebPAnimDecoder object, so that next call to // WebPAnimDecoderGetNext() will restart decoding from 1st frame. This would be @@ -333,7 +338,7 @@ WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec); // info.loop_count times) without destroying and recreating the 'dec' object. // Parameters: // dec - (in/out) decoder instance to be reset -WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec); +WEBP_EXTERN void WebPAnimDecoderReset(WebPAnimDecoder* dec); // Grab the internal demuxer object. // Getting the demuxer object can be useful if one wants to use operations only @@ -343,16 +348,16 @@ WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec); // // Parameters: // dec - (in) decoder instance from which the demuxer object is to be fetched. -WEBP_EXTERN(const WebPDemuxer*) WebPAnimDecoderGetDemuxer( +WEBP_EXTERN const WebPDemuxer* WebPAnimDecoderGetDemuxer( const WebPAnimDecoder* dec); // Deletes the WebPAnimDecoder object. // Parameters: // dec - (in/out) decoder instance to be deleted -WEBP_EXTERN(void) WebPAnimDecoderDelete(WebPAnimDecoder* dec); +WEBP_EXTERN void WebPAnimDecoderDelete(WebPAnimDecoder* dec); #ifdef __cplusplus } // extern "C" #endif -#endif /* WEBP_WEBP_DEMUX_H_ */ +#endif // WEBP_WEBP_DEMUX_H_ diff --git a/src/3rdparty/libwebp/src/webp/encode.h b/src/3rdparty/libwebp/src/webp/encode.h index 35fde1d..339f881 100644 --- a/src/3rdparty/libwebp/src/webp/encode.h +++ b/src/3rdparty/libwebp/src/webp/encode.h @@ -35,7 +35,7 @@ typedef struct WebPMemoryWriter WebPMemoryWriter; // Return the encoder's version number, packed in hexadecimal using 8bits for // each of major/minor/revision. E.g: v2.5.7 is 0x020507. -WEBP_EXTERN(int) WebPGetEncoderVersion(void); +WEBP_EXTERN int WebPGetEncoderVersion(void); //------------------------------------------------------------------------------ // One-stop-shop call! No questions asked: @@ -46,37 +46,41 @@ WEBP_EXTERN(int) WebPGetEncoderVersion(void); // These functions compress using the lossy format, and the quality_factor // can go from 0 (smaller output, lower quality) to 100 (best quality, // larger output). -WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb, +WEBP_EXTERN size_t WebPEncodeRGB(const uint8_t* rgb, + int width, int height, int stride, + float quality_factor, uint8_t** output); +WEBP_EXTERN size_t WebPEncodeBGR(const uint8_t* bgr, + int width, int height, int stride, + float quality_factor, uint8_t** output); +WEBP_EXTERN size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride, float quality_factor, uint8_t** output); -WEBP_EXTERN(size_t) WebPEncodeBGR(const uint8_t* bgr, +WEBP_EXTERN size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride, float quality_factor, uint8_t** output); -WEBP_EXTERN(size_t) WebPEncodeRGBA(const uint8_t* rgba, - int width, int height, int stride, - float quality_factor, uint8_t** output); -WEBP_EXTERN(size_t) WebPEncodeBGRA(const uint8_t* bgra, - int width, int height, int stride, - float quality_factor, uint8_t** output); // These functions are the equivalent of the above, but compressing in a // lossless manner. Files are usually larger than lossy format, but will // not suffer any compression loss. -WEBP_EXTERN(size_t) WebPEncodeLosslessRGB(const uint8_t* rgb, +// Note these functions, like the lossy versions, use the library's default +// settings. For lossless this means 'exact' is disabled. RGB values in +// transparent areas will be modified to improve compression. To avoid this, +// use WebPEncode() and set WebPConfig::exact to 1. +WEBP_EXTERN size_t WebPEncodeLosslessRGB(const uint8_t* rgb, + int width, int height, int stride, + uint8_t** output); +WEBP_EXTERN size_t WebPEncodeLosslessBGR(const uint8_t* bgr, + int width, int height, int stride, + uint8_t** output); +WEBP_EXTERN size_t WebPEncodeLosslessRGBA(const uint8_t* rgba, int width, int height, int stride, uint8_t** output); -WEBP_EXTERN(size_t) WebPEncodeLosslessBGR(const uint8_t* bgr, +WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra, int width, int height, int stride, uint8_t** output); -WEBP_EXTERN(size_t) WebPEncodeLosslessRGBA(const uint8_t* rgba, - int width, int height, int stride, - uint8_t** output); -WEBP_EXTERN(size_t) WebPEncodeLosslessBGRA(const uint8_t* bgra, - int width, int height, int stride, - uint8_t** output); // Releases memory returned by the WebPEncode*() functions above. -WEBP_EXTERN(void) WebPFree(void* ptr); +WEBP_EXTERN void WebPFree(void* ptr); //------------------------------------------------------------------------------ // Coding parameters @@ -93,12 +97,15 @@ typedef enum WebPImageHint { // Compression parameters. struct WebPConfig { int lossless; // Lossless encoding (0=lossy(default), 1=lossless). - float quality; // between 0 (smallest file) and 100 (biggest) + float quality; // between 0 and 100. For lossy, 0 gives the smallest + // size and 100 the largest. For lossless, this + // parameter is the amount of effort put into the + // compression: 0 is the fastest but gives larger + // files compared to the slowest, but best, 100. int method; // quality/speed trade-off (0=fast, 6=slower-better) WebPImageHint image_hint; // Hint for image type (lossless only for now). - // Parameters related to lossy compression only: int target_size; // if non-zero, set the desired target size in bytes. // Takes precedence over the 'compression' parameter. float target_PSNR; // if non-zero, specifies the minimal distortion to @@ -159,7 +166,7 @@ typedef enum WebPPreset { } WebPPreset; // Internal, version-checked, entry point -WEBP_EXTERN(int) WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int); +WEBP_EXTERN int WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int); // Should always be called, to initialize a fresh WebPConfig structure before // modification. Returns false in case of version mismatch. WebPConfigInit() @@ -186,15 +193,15 @@ static WEBP_INLINE int WebPConfigPreset(WebPConfig* config, // speed and final compressed size. // This function will overwrite several fields from config: 'method', 'quality' // and 'lossless'. Returns false in case of parameter error. -WEBP_EXTERN(int) WebPConfigLosslessPreset(WebPConfig* config, int level); +WEBP_EXTERN int WebPConfigLosslessPreset(WebPConfig* config, int level); // Returns true if 'config' is non-NULL and all configuration parameters are // within their valid ranges. -WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* config); +WEBP_EXTERN int WebPValidateConfig(const WebPConfig* config); //------------------------------------------------------------------------------ // Input / Output -// Structure for storing auxiliary statistics (mostly for lossy encoding). +// Structure for storing auxiliary statistics. struct WebPAuxStats { int coded_size; // final size @@ -242,16 +249,16 @@ struct WebPMemoryWriter { }; // The following must be called first before any use. -WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer); +WEBP_EXTERN void WebPMemoryWriterInit(WebPMemoryWriter* writer); // The following must be called to deallocate writer->mem memory. The 'writer' // object itself is not deallocated. -WEBP_EXTERN(void) WebPMemoryWriterClear(WebPMemoryWriter* writer); +WEBP_EXTERN void WebPMemoryWriterClear(WebPMemoryWriter* writer); // The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon // completion, writer.mem and writer.size will hold the coded data. // writer.mem must be freed by calling WebPMemoryWriterClear. -WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size, - const WebPPicture* picture); +WEBP_EXTERN int WebPMemoryWrite(const uint8_t* data, size_t data_size, + const WebPPicture* picture); // Progress hook, called from time to time to report progress. It can return // false to request an abort of the encoding process, or true otherwise if @@ -354,7 +361,7 @@ struct WebPPicture { }; // Internal, version-checked, entry point -WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture*, int); +WEBP_EXTERN int WebPPictureInitInternal(WebPPicture*, int); // Should always be called, to initialize the structure. Returns false in case // of version mismatch. WebPPictureInit() must have succeeded before using the @@ -371,20 +378,20 @@ static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) { // Allocate y/u/v buffers as per colorspace/width/height specification. // Note! This function will free the previous buffer if needed. // Returns false in case of memory error. -WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* picture); +WEBP_EXTERN int WebPPictureAlloc(WebPPicture* picture); // Release the memory allocated by WebPPictureAlloc() or WebPPictureImport*(). // Note that this function does _not_ free the memory used by the 'picture' // object itself. // Besides memory (which is reclaimed) all other fields of 'picture' are // preserved. -WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture); +WEBP_EXTERN void WebPPictureFree(WebPPicture* picture); // Copy the pixels of *src into *dst, using WebPPictureAlloc. Upon return, *dst // will fully own the copied pixels (this is not a view). The 'dst' picture need // not be initialized as its content is overwritten. // Returns false in case of memory allocation error. -WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst); +WEBP_EXTERN int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst); // Compute the single distortion for packed planes of samples. // 'src' will be compared to 'ref', and the raw distortion stored into @@ -393,19 +400,19 @@ WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst); // 'x_step' is the horizontal stride (in bytes) between samples. // 'src/ref_stride' is the byte distance between rows. // Returns false in case of error (bad parameter, memory allocation error, ...). -WEBP_EXTERN(int) WebPPlaneDistortion(const uint8_t* src, size_t src_stride, - const uint8_t* ref, size_t ref_stride, - int width, int height, - size_t x_step, - int type, // 0 = PSNR, 1 = SSIM, 2 = LSIM - float* distortion, float* result); +WEBP_EXTERN int WebPPlaneDistortion(const uint8_t* src, size_t src_stride, + const uint8_t* ref, size_t ref_stride, + int width, int height, + size_t x_step, + int type, // 0 = PSNR, 1 = SSIM, 2 = LSIM + float* distortion, float* result); // Compute PSNR, SSIM or LSIM distortion metric between two pictures. Results // are in dB, stored in result[] in the B/G/R/A/All order. The distortion is // always performed using ARGB samples. Hence if the input is YUV(A), the // picture will be internally converted to ARGB (just for the measurement). // Warning: this function is rather CPU-intensive. -WEBP_EXTERN(int) WebPPictureDistortion( +WEBP_EXTERN int WebPPictureDistortion( const WebPPicture* src, const WebPPicture* ref, int metric_type, // 0 = PSNR, 1 = SSIM, 2 = LSIM float result[5]); @@ -418,8 +425,8 @@ WEBP_EXTERN(int) WebPPictureDistortion( // must be fully be comprised inside the 'src' source picture. If the source // picture uses the YUV420 colorspace, the top and left coordinates will be // snapped to even values. -WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture, - int left, int top, int width, int height); +WEBP_EXTERN int WebPPictureCrop(WebPPicture* picture, + int left, int top, int width, int height); // Extracts a view from 'src' picture into 'dst'. The rectangle for the view // is defined by the top-left corner pixel coordinates (left, top) as well @@ -432,42 +439,42 @@ WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture, // with WebPPictureInit() if it is different from 'src', since its content will // be overwritten. // Returns false in case of memory allocation error or invalid parameters. -WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src, - int left, int top, int width, int height, - WebPPicture* dst); +WEBP_EXTERN int WebPPictureView(const WebPPicture* src, + int left, int top, int width, int height, + WebPPicture* dst); // Returns true if the 'picture' is actually a view and therefore does // not own the memory for pixels. -WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture); +WEBP_EXTERN int WebPPictureIsView(const WebPPicture* picture); // Rescale a picture to new dimension width x height. // If either 'width' or 'height' (but not both) is 0 the corresponding // dimension will be calculated preserving the aspect ratio. // No gamma correction is applied. // Returns false in case of error (invalid parameter or insufficient memory). -WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height); +WEBP_EXTERN int WebPPictureRescale(WebPPicture* pic, int width, int height); // Colorspace conversion function to import RGB samples. // Previous buffer will be free'd, if any. // *rgb buffer should have a size of at least height * rgb_stride. // Returns false in case of memory error. -WEBP_EXTERN(int) WebPPictureImportRGB( +WEBP_EXTERN int WebPPictureImportRGB( WebPPicture* picture, const uint8_t* rgb, int rgb_stride); // Same, but for RGBA buffer. -WEBP_EXTERN(int) WebPPictureImportRGBA( +WEBP_EXTERN int WebPPictureImportRGBA( WebPPicture* picture, const uint8_t* rgba, int rgba_stride); // Same, but for RGBA buffer. Imports the RGB direct from the 32-bit format // input buffer ignoring the alpha channel. Avoids needing to copy the data // to a temporary 24-bit RGB buffer to import the RGB only. -WEBP_EXTERN(int) WebPPictureImportRGBX( +WEBP_EXTERN int WebPPictureImportRGBX( WebPPicture* picture, const uint8_t* rgbx, int rgbx_stride); // Variants of the above, but taking BGR(A|X) input. -WEBP_EXTERN(int) WebPPictureImportBGR( +WEBP_EXTERN int WebPPictureImportBGR( WebPPicture* picture, const uint8_t* bgr, int bgr_stride); -WEBP_EXTERN(int) WebPPictureImportBGRA( +WEBP_EXTERN int WebPPictureImportBGRA( WebPPicture* picture, const uint8_t* bgra, int bgra_stride); -WEBP_EXTERN(int) WebPPictureImportBGRX( +WEBP_EXTERN int WebPPictureImportBGRX( WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride); // Converts picture->argb data to the YUV420A format. The 'colorspace' @@ -476,14 +483,14 @@ WEBP_EXTERN(int) WebPPictureImportBGRX( // non-opaque transparent values is detected, and 'colorspace' will be // adjusted accordingly. Note that this method is lossy. // Returns false in case of error. -WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture, - WebPEncCSP /*colorspace = WEBP_YUV420*/); +WEBP_EXTERN int WebPPictureARGBToYUVA(WebPPicture* picture, + WebPEncCSP /*colorspace = WEBP_YUV420*/); // Same as WebPPictureARGBToYUVA(), but the conversion is done using // pseudo-random dithering with a strength 'dithering' between // 0.0 (no dithering) and 1.0 (maximum dithering). This is useful // for photographic picture. -WEBP_EXTERN(int) WebPPictureARGBToYUVADithered( +WEBP_EXTERN int WebPPictureARGBToYUVADithered( WebPPicture* picture, WebPEncCSP colorspace, float dithering); // Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion. @@ -491,9 +498,9 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVADithered( // method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better // and sharper YUV representation. // Returns false in case of error. -WEBP_EXTERN(int) WebPPictureSharpARGBToYUVA(WebPPicture* picture); +WEBP_EXTERN int WebPPictureSharpARGBToYUVA(WebPPicture* picture); // kept for backward compatibility: -WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture); +WEBP_EXTERN int WebPPictureSmartARGBToYUVA(WebPPicture* picture); // Converts picture->yuv to picture->argb and sets picture->use_argb to true. // The input format must be YUV_420 or YUV_420A. The conversion from YUV420 to @@ -501,22 +508,22 @@ WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture); // Note that the use of this colorspace is discouraged if one has access to the // raw ARGB samples, since using YUV420 is comparatively lossy. // Returns false in case of error. -WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture); +WEBP_EXTERN int WebPPictureYUVAToARGB(WebPPicture* picture); // Helper function: given a width x height plane of RGBA or YUV(A) samples -// clean-up the YUV or RGB samples under fully transparent area, to help -// compressibility (no guarantee, though). -WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture); +// clean-up or smoothen the YUV or RGB samples under fully transparent area, +// to help compressibility (no guarantee, though). +WEBP_EXTERN void WebPCleanupTransparentArea(WebPPicture* picture); // Scan the picture 'picture' for the presence of non fully opaque alpha values. // Returns true in such case. Otherwise returns false (indicating that the // alpha plane can be ignored altogether e.g.). -WEBP_EXTERN(int) WebPPictureHasTransparency(const WebPPicture* picture); +WEBP_EXTERN int WebPPictureHasTransparency(const WebPPicture* picture); // Remove the transparency information (if present) by blending the color with // the background color 'background_rgb' (specified as 24bit RGB triplet). // After this call, all alpha values are reset to 0xff. -WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb); +WEBP_EXTERN void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb); //------------------------------------------------------------------------------ // Main call @@ -531,7 +538,7 @@ WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb); // the former for lossy encoding, and the latter for lossless encoding // (when config.lossless is true). Automatic conversion from one format to // another is provided but they both incur some loss. -WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture); +WEBP_EXTERN int WebPEncode(const WebPConfig* config, WebPPicture* picture); //------------------------------------------------------------------------------ @@ -539,4 +546,4 @@ WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture); } // extern "C" #endif -#endif /* WEBP_WEBP_ENCODE_H_ */ +#endif // WEBP_WEBP_ENCODE_H_ diff --git a/src/3rdparty/libwebp/src/webp/format_constants.h b/src/3rdparty/libwebp/src/webp/format_constants.h index 329fc8a..eca6981 100644 --- a/src/3rdparty/libwebp/src/webp/format_constants.h +++ b/src/3rdparty/libwebp/src/webp/format_constants.h @@ -84,4 +84,4 @@ typedef enum { // overflow a uint32_t. #define MAX_CHUNK_PAYLOAD (~0U - CHUNK_HEADER_SIZE - 1) -#endif /* WEBP_WEBP_FORMAT_CONSTANTS_H_ */ +#endif // WEBP_WEBP_FORMAT_CONSTANTS_H_ diff --git a/src/3rdparty/libwebp/src/webp/mux.h b/src/3rdparty/libwebp/src/webp/mux.h index daccc65..66096a9 100644 --- a/src/3rdparty/libwebp/src/webp/mux.h +++ b/src/3rdparty/libwebp/src/webp/mux.h @@ -98,13 +98,13 @@ typedef enum WebPChunkId { // Returns the version number of the mux library, packed in hexadecimal using // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507. -WEBP_EXTERN(int) WebPGetMuxVersion(void); +WEBP_EXTERN int WebPGetMuxVersion(void); //------------------------------------------------------------------------------ // Life of a Mux object // Internal, version-checked, entry point -WEBP_EXTERN(WebPMux*) WebPNewInternal(int); +WEBP_EXTERN WebPMux* WebPNewInternal(int); // Creates an empty mux object. // Returns: @@ -117,13 +117,13 @@ static WEBP_INLINE WebPMux* WebPMuxNew(void) { // Deletes the mux object. // Parameters: // mux - (in/out) object to be deleted -WEBP_EXTERN(void) WebPMuxDelete(WebPMux* mux); +WEBP_EXTERN void WebPMuxDelete(WebPMux* mux); //------------------------------------------------------------------------------ // Mux creation. // Internal, version-checked, entry point -WEBP_EXTERN(WebPMux*) WebPMuxCreateInternal(const WebPData*, int, int); +WEBP_EXTERN WebPMux* WebPMuxCreateInternal(const WebPData*, int, int); // Creates a mux object from raw data given in WebP RIFF format. // Parameters: @@ -160,7 +160,7 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream, // or if fourcc corresponds to an image chunk. // WEBP_MUX_MEMORY_ERROR - on memory allocation error. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk( +WEBP_EXTERN WebPMuxError WebPMuxSetChunk( WebPMux* mux, const char fourcc[4], const WebPData* chunk_data, int copy_data); @@ -176,7 +176,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk( // or if fourcc corresponds to an image chunk. // WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given id. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk( +WEBP_EXTERN WebPMuxError WebPMuxGetChunk( const WebPMux* mux, const char fourcc[4], WebPData* chunk_data); // Deletes the chunk with the given 'fourcc' from the mux object. @@ -189,7 +189,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk( // or if fourcc corresponds to an image chunk. // WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given fourcc. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk( +WEBP_EXTERN WebPMuxError WebPMuxDeleteChunk( WebPMux* mux, const char fourcc[4]); //------------------------------------------------------------------------------ @@ -222,7 +222,7 @@ struct WebPMuxFrameInfo { // WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL. // WEBP_MUX_MEMORY_ERROR - on memory allocation error. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxSetImage( +WEBP_EXTERN WebPMuxError WebPMuxSetImage( WebPMux* mux, const WebPData* bitstream, int copy_data); // Adds a frame at the end of the mux object. @@ -241,7 +241,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetImage( // or if content of 'frame' is invalid. // WEBP_MUX_MEMORY_ERROR - on memory allocation error. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame( +WEBP_EXTERN WebPMuxError WebPMuxPushFrame( WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data); // Gets the nth frame from the mux object. @@ -259,7 +259,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame( // WEBP_MUX_BAD_DATA - if nth frame chunk in mux is invalid. // WEBP_MUX_MEMORY_ERROR - on memory allocation error. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame( +WEBP_EXTERN WebPMuxError WebPMuxGetFrame( const WebPMux* mux, uint32_t nth, WebPMuxFrameInfo* frame); // Deletes a frame from the mux object. @@ -272,7 +272,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame( // WEBP_MUX_NOT_FOUND - If there are less than nth frames in the mux object // before deletion. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth); +WEBP_EXTERN WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth); //------------------------------------------------------------------------------ // Animation. @@ -296,7 +296,7 @@ struct WebPMuxAnimParams { // WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL. // WEBP_MUX_MEMORY_ERROR - on memory allocation error. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams( +WEBP_EXTERN WebPMuxError WebPMuxSetAnimationParams( WebPMux* mux, const WebPMuxAnimParams* params); // Gets the animation parameters from the mux object. @@ -307,7 +307,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams( // WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL. // WEBP_MUX_NOT_FOUND - if ANIM chunk is not present in mux object. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams( +WEBP_EXTERN WebPMuxError WebPMuxGetAnimationParams( const WebPMux* mux, WebPMuxAnimParams* params); //------------------------------------------------------------------------------ @@ -328,8 +328,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams( // WEBP_MUX_INVALID_ARGUMENT - if mux is NULL; or // width or height are invalid or out of bounds // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux, - int width, int height); +WEBP_EXTERN WebPMuxError WebPMuxSetCanvasSize(WebPMux* mux, + int width, int height); // Gets the canvas size from the mux object. // Note: This method assumes that the VP8X chunk, if present, is up-to-date. @@ -343,8 +343,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux, // WEBP_MUX_INVALID_ARGUMENT - if mux, width or height is NULL. // WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux, - int* width, int* height); +WEBP_EXTERN WebPMuxError WebPMuxGetCanvasSize(const WebPMux* mux, + int* width, int* height); // Gets the feature flags from the mux object. // Note: This method assumes that the VP8X chunk, if present, is up-to-date. @@ -359,8 +359,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux, // WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL. // WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux, - uint32_t* flags); +WEBP_EXTERN WebPMuxError WebPMuxGetFeatures(const WebPMux* mux, + uint32_t* flags); // Gets number of chunks with the given 'id' in the mux object. // Parameters: @@ -370,8 +370,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux, // Returns: // WEBP_MUX_INVALID_ARGUMENT - if mux, or num_elements is NULL. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux, - WebPChunkId id, int* num_elements); +WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux, + WebPChunkId id, int* num_elements); // Assembles all chunks in WebP RIFF format and returns in 'assembled_data'. // This function also validates the mux object. @@ -388,8 +388,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux, // WEBP_MUX_INVALID_ARGUMENT - if mux or assembled_data is NULL. // WEBP_MUX_MEMORY_ERROR - on memory allocation error. // WEBP_MUX_OK - on success. -WEBP_EXTERN(WebPMuxError) WebPMuxAssemble(WebPMux* mux, - WebPData* assembled_data); +WEBP_EXTERN WebPMuxError WebPMuxAssemble(WebPMux* mux, + WebPData* assembled_data); //------------------------------------------------------------------------------ // WebPAnimEncoder API @@ -442,7 +442,7 @@ struct WebPAnimEncoderOptions { }; // Internal, version-checked, entry point. -WEBP_EXTERN(int) WebPAnimEncoderOptionsInitInternal( +WEBP_EXTERN int WebPAnimEncoderOptionsInitInternal( WebPAnimEncoderOptions*, int); // Should always be called, to initialize a fresh WebPAnimEncoderOptions @@ -455,7 +455,7 @@ static WEBP_INLINE int WebPAnimEncoderOptionsInit( } // Internal, version-checked, entry point. -WEBP_EXTERN(WebPAnimEncoder*) WebPAnimEncoderNewInternal( +WEBP_EXTERN WebPAnimEncoder* WebPAnimEncoderNewInternal( int, int, const WebPAnimEncoderOptions*, int); // Creates and initializes a WebPAnimEncoder object. @@ -490,7 +490,7 @@ static WEBP_INLINE WebPAnimEncoder* WebPAnimEncoderNew( // Returns: // On error, returns false and frame->error_code is set appropriately. // Otherwise, returns true. -WEBP_EXTERN(int) WebPAnimEncoderAdd( +WEBP_EXTERN int WebPAnimEncoderAdd( WebPAnimEncoder* enc, struct WebPPicture* frame, int timestamp_ms, const struct WebPConfig* config); @@ -503,8 +503,8 @@ WEBP_EXTERN(int) WebPAnimEncoderAdd( // webp_data - (out) generated WebP bitstream. // Returns: // True on success. -WEBP_EXTERN(int) WebPAnimEncoderAssemble(WebPAnimEncoder* enc, - WebPData* webp_data); +WEBP_EXTERN int WebPAnimEncoderAssemble(WebPAnimEncoder* enc, + WebPData* webp_data); // Get error string corresponding to the most recent call using 'enc'. The // returned string is owned by 'enc' and is valid only until the next call to @@ -514,12 +514,12 @@ WEBP_EXTERN(int) WebPAnimEncoderAssemble(WebPAnimEncoder* enc, // Returns: // NULL if 'enc' is NULL. Otherwise, returns the error string if the last call // to 'enc' had an error, or an empty string if the last call was a success. -WEBP_EXTERN(const char*) WebPAnimEncoderGetError(WebPAnimEncoder* enc); +WEBP_EXTERN const char* WebPAnimEncoderGetError(WebPAnimEncoder* enc); // Deletes the WebPAnimEncoder object. // Parameters: // enc - (in/out) object to be deleted -WEBP_EXTERN(void) WebPAnimEncoderDelete(WebPAnimEncoder* enc); +WEBP_EXTERN void WebPAnimEncoderDelete(WebPAnimEncoder* enc); //------------------------------------------------------------------------------ @@ -527,4 +527,4 @@ WEBP_EXTERN(void) WebPAnimEncoderDelete(WebPAnimEncoder* enc); } // extern "C" #endif -#endif /* WEBP_WEBP_MUX_H_ */ +#endif // WEBP_WEBP_MUX_H_ diff --git a/src/3rdparty/libwebp/src/webp/mux_types.h b/src/3rdparty/libwebp/src/webp/mux_types.h index b37e2c6..ceea77d 100644 --- a/src/3rdparty/libwebp/src/webp/mux_types.h +++ b/src/3rdparty/libwebp/src/webp/mux_types.h @@ -95,4 +95,4 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) { } // extern "C" #endif -#endif /* WEBP_WEBP_MUX_TYPES_H_ */ +#endif // WEBP_WEBP_MUX_TYPES_H_ diff --git a/src/3rdparty/libwebp/src/webp/types.h b/src/3rdparty/libwebp/src/webp/types.h index 98fff35..0ce2622 100644 --- a/src/3rdparty/libwebp/src/webp/types.h +++ b/src/3rdparty/libwebp/src/webp/types.h @@ -40,13 +40,13 @@ typedef long long int int64_t; // This explicitly marks library functions and allows for changing the // signature for e.g., Windows DLL builds. # if defined(__GNUC__) && __GNUC__ >= 4 -# define WEBP_EXTERN(type) extern __attribute__ ((visibility ("default"))) type +# define WEBP_EXTERN extern __attribute__ ((visibility ("default"))) # else -# define WEBP_EXTERN(type) extern type +# define WEBP_EXTERN extern # endif /* __GNUC__ >= 4 */ #endif /* WEBP_EXTERN */ // Macro to check ABI compatibility (same major revision number) #define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8)) -#endif /* WEBP_WEBP_TYPES_H_ */ +#endif // WEBP_WEBP_TYPES_H_ |