diff options
author | Eirik Aavitsland <eirik.aavitsland@qt.io> | 2023-07-04 11:19:30 +0200 |
---|---|---|
committer | Eirik Aavitsland <eirik.aavitsland@qt.io> | 2023-07-04 12:24:40 +0200 |
commit | 32d5b3dcd33c2ae724a13040838313f96cf07966 (patch) | |
tree | 03f16d05879e289102bfa227d6f0ae80b6af111e /src/3rdparty/libwebp/src/dsp | |
parent | 3c858016f91f2e977ea67917bd01db3a7e76ea4c (diff) |
Update bundled libwebp to version 1.3.1
[ChangeLog][Third-Party Code] Update bundled libwebp to version 1.3.1
Pick-to: 6.6 6.5 6.2 5.15
Change-Id: I04b4a9fdb8da7cd3627d4ffaa15326c85858c425
Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@qt.io>
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp')
22 files changed, 242 insertions, 83 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c index 1892929..1d152f2 100644 --- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c +++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c @@ -425,6 +425,7 @@ void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color); //------------------------------------------------------------------------------ // Init function +extern VP8CPUInfo VP8GetCPUInfo; extern void WebPInitAlphaProcessingMIPSdspR2(void); extern void WebPInitAlphaProcessingSSE2(void); extern void WebPInitAlphaProcessingSSE41(void); diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c index 460ec4f..73d2140 100644 --- a/src/3rdparty/libwebp/src/dsp/cost.c +++ b/src/3rdparty/libwebp/src/dsp/cost.c @@ -374,6 +374,7 @@ static void SetResidualCoeffs_C(const int16_t* const coeffs, VP8GetResidualCostFunc VP8GetResidualCost; VP8SetResidualCoeffsFunc VP8SetResidualCoeffs; +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8EncDspCostInitMIPS32(void); extern void VP8EncDspCostInitMIPSdspR2(void); extern void VP8EncDspCostInitSSE2(void); diff --git a/src/3rdparty/libwebp/src/dsp/cost_neon.c b/src/3rdparty/libwebp/src/dsp/cost_neon.c index 8cc8ce5..6582669 100644 --- a/src/3rdparty/libwebp/src/dsp/cost_neon.c +++ b/src/3rdparty/libwebp/src/dsp/cost_neon.c @@ -29,7 +29,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs, const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1)); const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position)); -#ifdef __aarch64__ +#if WEBP_AARCH64 res->last = vmaxvq_u8(masked) - 1; #else const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked)); @@ -43,7 +43,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs, vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0); --res->last; -#endif // __aarch64__ +#endif // WEBP_AARCH64 res->coeffs = coeffs; } diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c index 62de73f..2234c77 100644 --- a/src/3rdparty/libwebp/src/dsp/cpu.c +++ b/src/3rdparty/libwebp/src/dsp/cpu.c @@ -173,6 +173,7 @@ static int x86CPUInfo(CPUFeature feature) { } return 0; } +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; VP8CPUInfo VP8GetCPUInfo = x86CPUInfo; #elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test. static int AndroidCPUInfo(CPUFeature feature) { @@ -184,6 +185,7 @@ static int AndroidCPUInfo(CPUFeature feature) { } return 0; } +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo; #elif defined(EMSCRIPTEN) // also needs to be before generic NEON test // Use compile flags as an indicator of SIMD support instead of a runtime check. @@ -208,6 +210,7 @@ static int wasmCPUInfo(CPUFeature feature) { } return 0; } +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo; #elif defined(WEBP_HAVE_NEON) // In most cases this function doesn't check for NEON support (it's assumed by @@ -236,6 +239,7 @@ static int armCPUInfo(CPUFeature feature) { return 1; #endif } +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; VP8CPUInfo VP8GetCPUInfo = armCPUInfo; #elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \ defined(WEBP_USE_MSA) @@ -247,7 +251,9 @@ static int mipsCPUInfo(CPUFeature feature) { } } +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo; #else +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; VP8CPUInfo VP8GetCPUInfo = NULL; #endif diff --git a/src/3rdparty/libwebp/src/dsp/cpu.h b/src/3rdparty/libwebp/src/dsp/cpu.h index de32a39..581ecbd 100644 --- a/src/3rdparty/libwebp/src/dsp/cpu.h +++ b/src/3rdparty/libwebp/src/dsp/cpu.h @@ -14,10 +14,10 @@ #ifndef WEBP_DSP_CPU_H_ #define WEBP_DSP_CPU_H_ -#include <stddef.h> - #include <qglobal.h> +#include <stddef.h> + #ifdef HAVE_CONFIG_H #include "src/webp/config.h" #endif @@ -45,6 +45,9 @@ #define __has_builtin(x) 0 #endif +//------------------------------------------------------------------------------ +// x86 defines. + #if !defined(HAVE_CONFIG_H) #if defined(_MSC_VER) && _MSC_VER > 1310 && \ (defined(_M_X64) || defined(_M_IX86)) && !defined(__clang__) @@ -82,6 +85,9 @@ #undef WEBP_MSC_SSE41 #undef WEBP_MSC_SSE2 +//------------------------------------------------------------------------------ +// Arm defines. + // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. #if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \ @@ -100,17 +106,27 @@ // inclusion of arm64_neon.h; Visual Studio 2019 includes this file in // arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with // vtbl4_u8(); a fix was made in 16.6. -#if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \ - (_MSC_VER >= 1926 && defined(_M_ARM64))) && \ - !defined(__clang__) && (QT_CONFIG_neon == 1) +#if defined(_MSC_VER) && \ + ((_MSC_VER >= 1700 && defined(_M_ARM)) || \ + (_MSC_VER >= 1926 && (defined(_M_ARM64) || defined(_M_ARM64EC)))) && \ + !defined(__clang__) && (QT_CONFIG_neon == 1) #define WEBP_USE_NEON #define WEBP_USE_INTRINSICS #endif +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#define WEBP_AARCH64 1 +#else +#define WEBP_AARCH64 0 +#endif + #if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) #define WEBP_HAVE_NEON #endif +//------------------------------------------------------------------------------ +// MIPS defines. + #if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \ (__mips_isa_rev >= 1) && (__mips_isa_rev < 6) #define WEBP_USE_MIPS32 @@ -126,6 +142,8 @@ #define WEBP_USE_MSA #endif +//------------------------------------------------------------------------------ + #ifndef WEBP_DSP_OMIT_C_CODE #define WEBP_DSP_OMIT_C_CODE 1 #endif @@ -136,13 +154,14 @@ #define WEBP_NEON_OMIT_C_CODE 0 #endif -#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \ - defined(__aarch64__)) +#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64) #define WEBP_NEON_WORK_AROUND_GCC 1 #else #define WEBP_NEON_WORK_AROUND_GCC 0 #endif +//------------------------------------------------------------------------------ + // This macro prevents thread_sanitizer from reporting known concurrent writes. #define WEBP_TSAN_IGNORE_FUNCTION #if defined(__has_feature) @@ -244,16 +263,7 @@ typedef enum { kMSA } CPUFeature; -#ifdef __cplusplus -extern "C" { -#endif - // returns true if the CPU supports the feature. typedef int (*VP8CPUInfo)(CPUFeature feature); -WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; - -#ifdef __cplusplus -} // extern "C" -#endif #endif // WEBP_DSP_CPU_H_ diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c index 537c701..33d8df8 100644 --- a/src/3rdparty/libwebp/src/dsp/dec.c +++ b/src/3rdparty/libwebp/src/dsp/dec.c @@ -734,6 +734,7 @@ VP8SimpleFilterFunc VP8SimpleHFilter16i; void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, int dst_stride); +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8DspInitSSE2(void); extern void VP8DspInitSSE41(void); extern void VP8DspInitNEON(void); diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c index fa85170..22784cf 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_neon.c +++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c @@ -1428,7 +1428,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x8_t A = vld1_u8(dst - BPS); // top row -#if defined(__aarch64__) +#if WEBP_AARCH64 const uint16_t p2 = vaddlv_u8(A); sum_top = vdupq_n_u16(p2); #else @@ -1511,7 +1511,7 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x16_t A = vld1q_u8(dst - BPS); // top row -#if defined(__aarch64__) +#if WEBP_AARCH64 const uint16_t p3 = vaddlvq_u8(A); sum_top = vdupq_n_u16(p3); #else diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c index ea47a3f..2ba97ba 100644 --- a/src/3rdparty/libwebp/src/dsp/enc.c +++ b/src/3rdparty/libwebp/src/dsp/enc.c @@ -732,6 +732,7 @@ VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT; VP8BlockCopy VP8Copy4x4; VP8BlockCopy VP8Copy16x8; +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8EncDspInitSSE2(void); extern void VP8EncDspInitSSE41(void); extern void VP8EncDspInitNEON(void); diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c index 3a04111..7148003 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_neon.c +++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c @@ -764,7 +764,7 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, // Horizontal sum of all four uint32_t values in 'sum'. static int SumToInt_NEON(uint32x4_t sum) { -#if defined(__aarch64__) +#if WEBP_AARCH64 return (int)vaddvq_u32(sum); #else const uint64x2_t sum2 = vpaddlq_u32(sum); @@ -865,7 +865,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16], uint8x8x4_t shuffles; // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. -#if defined(__APPLE__) && defined(__aarch64__) && \ +#if defined(__APPLE__) && WEBP_AARCH64 && \ defined(__apple_build_version__) && (__apple_build_version__< 6020037) uint8x16x2_t all_out; INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c index 1d10556..010624a 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c @@ -25,9 +25,160 @@ //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) -// Does one or two inverse transforms. -static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +// Does one inverse transform. +static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in, + uint8_t* dst) { + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85267 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant become the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + const __m128i k1k2 = _mm_set_epi16(-30068, -30068, -30068, -30068, + 20091, 20091, 20091, 20091); + const __m128i k2k1 = _mm_set_epi16(20091, 20091, 20091, 20091, + -30068, -30068, -30068, -30068); + const __m128i zero = _mm_setzero_si128(); + const __m128i zero_four = _mm_set_epi16(0, 0, 0, 0, 4, 4, 4, 4); + __m128i T01, T23; + + // Load and concatenate the transform coefficients. + const __m128i in01 = _mm_loadu_si128((const __m128i*)&in[0]); + const __m128i in23 = _mm_loadu_si128((const __m128i*)&in[8]); + // a00 a10 a20 a30 a01 a11 a21 a31 + // a02 a12 a22 a32 a03 a13 a23 a33 + + // Vertical pass and subsequent transpose. + { + const __m128i in1 = _mm_unpackhi_epi64(in01, in01); + const __m128i in3 = _mm_unpackhi_epi64(in23, in23); + + // First pass, c and d calculations are longer because of the "trick" + // multiplications. + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + const __m128i a_d3 = _mm_add_epi16(in01, in23); + const __m128i b_c3 = _mm_sub_epi16(in01, in23); + const __m128i c1d1 = _mm_mulhi_epi16(in1, k2k1); + const __m128i c2d2 = _mm_mulhi_epi16(in3, k1k2); + const __m128i c3 = _mm_unpackhi_epi64(b_c3, b_c3); + const __m128i c4 = _mm_sub_epi16(c1d1, c2d2); + const __m128i c = _mm_add_epi16(c3, c4); + const __m128i d4u = _mm_add_epi16(c1d1, c2d2); + const __m128i du = _mm_add_epi16(a_d3, d4u); + const __m128i d = _mm_unpackhi_epi64(du, du); + + // Second pass. + const __m128i comb_ab = _mm_unpacklo_epi64(a_d3, b_c3); + const __m128i comb_dc = _mm_unpacklo_epi64(d, c); + + const __m128i tmp01 = _mm_add_epi16(comb_ab, comb_dc); + const __m128i tmp32 = _mm_sub_epi16(comb_ab, comb_dc); + const __m128i tmp23 = _mm_shuffle_epi32(tmp32, _MM_SHUFFLE(1, 0, 3, 2)); + + const __m128i transpose_0 = _mm_unpacklo_epi16(tmp01, tmp23); + const __m128i transpose_1 = _mm_unpackhi_epi16(tmp01, tmp23); + // a00 a20 a01 a21 a02 a22 a03 a23 + // a10 a30 a11 a31 a12 a32 a13 a33 + + T01 = _mm_unpacklo_epi16(transpose_0, transpose_1); + T23 = _mm_unpackhi_epi16(transpose_0, transpose_1); + // a00 a10 a20 a30 a01 a11 a21 a31 + // a02 a12 a22 a32 a03 a13 a23 a33 + } + + // Horizontal pass and subsequent transpose. + { + const __m128i T1 = _mm_unpackhi_epi64(T01, T01); + const __m128i T3 = _mm_unpackhi_epi64(T23, T23); + + // First pass, c and d calculations are longer because of the "trick" + // multiplications. + const __m128i dc = _mm_add_epi16(T01, zero_four); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + const __m128i a_d3 = _mm_add_epi16(dc, T23); + const __m128i b_c3 = _mm_sub_epi16(dc, T23); + const __m128i c1d1 = _mm_mulhi_epi16(T1, k2k1); + const __m128i c2d2 = _mm_mulhi_epi16(T3, k1k2); + const __m128i c3 = _mm_unpackhi_epi64(b_c3, b_c3); + const __m128i c4 = _mm_sub_epi16(c1d1, c2d2); + const __m128i c = _mm_add_epi16(c3, c4); + const __m128i d4u = _mm_add_epi16(c1d1, c2d2); + const __m128i du = _mm_add_epi16(a_d3, d4u); + const __m128i d = _mm_unpackhi_epi64(du, du); + + // Second pass. + const __m128i comb_ab = _mm_unpacklo_epi64(a_d3, b_c3); + const __m128i comb_dc = _mm_unpacklo_epi64(d, c); + + const __m128i tmp01 = _mm_add_epi16(comb_ab, comb_dc); + const __m128i tmp32 = _mm_sub_epi16(comb_ab, comb_dc); + const __m128i tmp23 = _mm_shuffle_epi32(tmp32, _MM_SHUFFLE(1, 0, 3, 2)); + + const __m128i shifted01 = _mm_srai_epi16(tmp01, 3); + const __m128i shifted23 = _mm_srai_epi16(tmp23, 3); + // a00 a01 a02 a03 a10 a11 a12 a13 + // a20 a21 a22 a23 a30 a31 a32 a33 + + const __m128i transpose_0 = _mm_unpacklo_epi16(shifted01, shifted23); + const __m128i transpose_1 = _mm_unpackhi_epi16(shifted01, shifted23); + // a00 a20 a01 a21 a02 a22 a03 a23 + // a10 a30 a11 a31 a12 a32 a13 a33 + + T01 = _mm_unpacklo_epi16(transpose_0, transpose_1); + T23 = _mm_unpackhi_epi16(transpose_0, transpose_1); + // a00 a10 a20 a30 a01 a11 a21 a31 + // a02 a12 a22 a32 a03 a13 a23 a33 + } + + // Add inverse transform to 'ref' and store. + { + // Load the reference(s). + __m128i ref01, ref23, ref0123; + int32_t buf[4]; + + // Load four bytes/pixels per line. + const __m128i ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS])); + const __m128i ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS])); + const __m128i ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS])); + const __m128i ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS])); + ref01 = _mm_unpacklo_epi32(ref0, ref1); + ref23 = _mm_unpacklo_epi32(ref2, ref3); + + // Convert to 16b. + ref01 = _mm_unpacklo_epi8(ref01, zero); + ref23 = _mm_unpacklo_epi8(ref23, zero); + // Add the inverse transform(s). + ref01 = _mm_add_epi16(ref01, T01); + ref23 = _mm_add_epi16(ref23, T23); + // Unsigned saturate to 8b. + ref0123 = _mm_packus_epi16(ref01, ref23); + + _mm_storeu_si128((__m128i *)buf, ref0123); + + // Store four bytes/pixels per line. + WebPInt32ToMem(&dst[0 * BPS], buf[0]); + WebPInt32ToMem(&dst[1 * BPS], buf[1]); + WebPInt32ToMem(&dst[2 * BPS], buf[2]); + WebPInt32ToMem(&dst[3 * BPS], buf[3]); + } +} + +// Does two inverse transforms. +static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in, + uint8_t* dst) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -49,33 +200,21 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, __m128i T0, T1, T2, T3; // Load and concatenate the transform coefficients (we'll do two inverse - // transforms in parallel). In the case of only one inverse transform, the - // second half of the vectors will just contain random value we'll never - // use nor store. + // transforms in parallel). __m128i in0, in1, in2, in3; { - in0 = _mm_loadl_epi64((const __m128i*)&in[0]); - in1 = _mm_loadl_epi64((const __m128i*)&in[4]); - in2 = _mm_loadl_epi64((const __m128i*)&in[8]); - in3 = _mm_loadl_epi64((const __m128i*)&in[12]); - // a00 a10 a20 a30 x x x x - // a01 a11 a21 a31 x x x x - // a02 a12 a22 a32 x x x x - // a03 a13 a23 a33 x x x x - if (do_two) { - const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]); - const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]); - const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]); - const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]); - in0 = _mm_unpacklo_epi64(in0, inB0); - in1 = _mm_unpacklo_epi64(in1, inB1); - in2 = _mm_unpacklo_epi64(in2, inB2); - in3 = _mm_unpacklo_epi64(in3, inB3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 - } + const __m128i tmp0 = _mm_loadu_si128((const __m128i*)&in[0]); + const __m128i tmp1 = _mm_loadu_si128((const __m128i*)&in[8]); + const __m128i tmp2 = _mm_loadu_si128((const __m128i*)&in[16]); + const __m128i tmp3 = _mm_loadu_si128((const __m128i*)&in[24]); + in0 = _mm_unpacklo_epi64(tmp0, tmp2); + in1 = _mm_unpackhi_epi64(tmp0, tmp2); + in2 = _mm_unpacklo_epi64(tmp1, tmp3); + in3 = _mm_unpackhi_epi64(tmp1, tmp3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and subsequent transpose. @@ -148,19 +287,11 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i zero = _mm_setzero_si128(); // Load the reference(s). __m128i ref0, ref1, ref2, ref3; - if (do_two) { - // Load eight bytes/pixels per line. - ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); - ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); - ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); - ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); - } else { - // Load four bytes/pixels per line. - ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS])); - ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS])); - ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS])); - ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS])); - } + // Load eight bytes/pixels per line. + ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); + ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); + ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); + ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); ref1 = _mm_unpacklo_epi8(ref1, zero); @@ -176,20 +307,21 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, ref1 = _mm_packus_epi16(ref1, ref1); ref2 = _mm_packus_epi16(ref2, ref2); ref3 = _mm_packus_epi16(ref3, ref3); - // Store the results. - if (do_two) { - // Store eight bytes/pixels per line. - _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); - _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); - _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); - _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); - } else { - // Store four bytes/pixels per line. - WebPInt32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); - WebPInt32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); - WebPInt32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); - WebPInt32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); - } + // Store eight bytes/pixels per line. + _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); + _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); + _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); + _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); + } +} + +// Does one or two inverse transforms. +static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + if (do_two) { + ITransform_Two_SSE2(ref, in, dst); + } else { + ITransform_One_SSE2(ref, in, dst); } } diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c index 4506567..85eee50 100644 --- a/src/3rdparty/libwebp/src/dsp/filters.c +++ b/src/3rdparty/libwebp/src/dsp/filters.c @@ -233,6 +233,7 @@ static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in, WebPFilterFunc WebPFilters[WEBP_FILTER_LAST]; WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST]; +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8FiltersInitMIPSdspR2(void); extern void VP8FiltersInitMSA(void); extern void VP8FiltersInitNEON(void); diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c index fb86e58..9f81209 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.c +++ b/src/3rdparty/libwebp/src/dsp/lossless.c @@ -588,6 +588,7 @@ VP8LConvertFunc VP8LConvertBGRAToBGR; VP8LMapARGBFunc VP8LMapColor32b; VP8LMapAlphaFunc VP8LMapColor8b; +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8LDspInitSSE2(void); extern void VP8LDspInitSSE41(void); extern void VP8LDspInitNEON(void); diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c index b1f9f26..cde1280 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c @@ -791,6 +791,7 @@ VP8LBundleColorMapFunc VP8LBundleColorMap; VP8LPredictorAddSubFunc VP8LPredictorsSub[16]; VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16]; +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8LEncDspInitSSE2(void); extern void VP8LEncDspInitSSE41(void); extern void VP8LEncDspInitNEON(void); diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c index 7c7b73f..e32c796 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c @@ -25,7 +25,7 @@ // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. -#if defined(__APPLE__) && defined(__aarch64__) && \ +#if defined(__APPLE__) && WEBP_AARCH64 && \ defined(__apple_build_version__) && (__apple_build_version__< 6020037) #define USE_VTBLQ #endif diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c index 89e3e01..ddc9b61 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c @@ -498,7 +498,7 @@ static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper, // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use // non-standard versions there. -#if defined(__APPLE__) && defined(__aarch64__) && \ +#if defined(__APPLE__) && WEBP_AARCH64 && \ defined(__apple_build_version__) && (__apple_build_version__< 6020037) #define USE_VTBLQ #endif diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h index c591f9b..14acb40 100644 --- a/src/3rdparty/libwebp/src/dsp/neon.h +++ b/src/3rdparty/libwebp/src/dsp/neon.h @@ -21,7 +21,7 @@ // Right now, some intrinsics functions seem slower, so we disable them // everywhere except newer clang/gcc or aarch64 where the inline assembly is // incompatible. -#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__) +#if LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 9) || WEBP_AARCH64 #define WEBP_USE_INTRINSICS // use intrinsics when possible #endif @@ -46,7 +46,7 @@ // if using intrinsics, this flag avoids some functions that make gcc-4.6.3 // crash ("internal compiler error: in immed_double_const, at emit-rtl."). // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183) -#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) +#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64) #define WORK_AROUND_GCC #endif diff --git a/src/3rdparty/libwebp/src/dsp/quant.h b/src/3rdparty/libwebp/src/dsp/quant.h index fc099bf..bf7734c 100644 --- a/src/3rdparty/libwebp/src/dsp/quant.h +++ b/src/3rdparty/libwebp/src/dsp/quant.h @@ -22,7 +22,7 @@ #define IsFlat IsFlat_NEON static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if WEBP_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c index 14620ce..325d8be 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler.c @@ -197,6 +197,7 @@ WebPRescalerImportRowFunc WebPRescalerImportRowShrink; WebPRescalerExportRowFunc WebPRescalerExportRowExpand; WebPRescalerExportRowFunc WebPRescalerExportRowShrink; +extern VP8CPUInfo VP8GetCPUInfo; extern void WebPRescalerDspInitSSE2(void); extern void WebPRescalerDspInitMIPS32(void); extern void WebPRescalerDspInitMIPSdspR2(void); diff --git a/src/3rdparty/libwebp/src/dsp/ssim.c b/src/3rdparty/libwebp/src/dsp/ssim.c index f85c2e6..9a1341e 100644 --- a/src/3rdparty/libwebp/src/dsp/ssim.c +++ b/src/3rdparty/libwebp/src/dsp/ssim.c @@ -137,6 +137,7 @@ VP8SSIMGetClippedFunc VP8SSIMGetClipped; VP8AccumulateSSEFunc VP8AccumulateSSE; #endif +extern VP8CPUInfo VP8GetCPUInfo; extern void VP8SSIMDspInitSSE2(void); WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) { diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c index 87f771f..983b9c4 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling.c @@ -215,6 +215,7 @@ static void EmptyYuv444Func(const uint8_t* y, WebPYUV444Converter WebPYUV444Converters[MODE_LAST]; +extern VP8CPUInfo VP8GetCPUInfo; extern void WebPInitYUV444ConvertersMIPSdspR2(void); extern void WebPInitYUV444ConvertersSSE2(void); extern void WebPInitYUV444ConvertersSSE41(void); diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c index 6ba71a7..bbc000c 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c @@ -111,7 +111,7 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 }; vst4_u8(out, v255_r_g_b); \ } while (0) -#if !defined(WEBP_SWAP_16BIT_CSP) +#if (WEBP_SWAP_16BIT_CSP == 0) #define ZIP_U8(lo, hi) vzip_u8((lo), (hi)) #else #define ZIP_U8(lo, hi) vzip_u8((hi), (lo)) diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c index d16c13d..8a04b85 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv.c +++ b/src/3rdparty/libwebp/src/dsp/yuv.c @@ -70,6 +70,7 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride, WebPSamplerRowFunc WebPSamplers[MODE_LAST]; +extern VP8CPUInfo VP8GetCPUInfo; extern void WebPInitSamplersSSE2(void); extern void WebPInitSamplersSSE41(void); extern void WebPInitSamplersMIPS32(void); |