summary refs log tree commit diff stats
path: root/src/3rdparty/libwebp/src/dsp
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp')
-rw-r--r-- src/3rdparty/libwebp/src/dsp/alpha_processing.c 65
-rw-r--r-- src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c 21
-rw-r--r-- src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c 20
-rw-r--r-- src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c 6
-rw-r--r-- src/3rdparty/libwebp/src/dsp/cost.c 4
-rw-r--r-- src/3rdparty/libwebp/src/dsp/cpu.c 13
-rw-r--r-- src/3rdparty/libwebp/src/dsp/dec.c 6
-rw-r--r-- src/3rdparty/libwebp/src/dsp/dsp.h 88
-rw-r--r-- src/3rdparty/libwebp/src/dsp/enc.c 6
-rw-r--r-- src/3rdparty/libwebp/src/dsp/filters.c 4
-rw-r--r-- src/3rdparty/libwebp/src/dsp/filters_sse2.c 5
-rw-r--r-- src/3rdparty/libwebp/src/dsp/lossless.c 10
-rw-r--r-- src/3rdparty/libwebp/src/dsp/lossless_enc.c 25
-rw-r--r-- src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c 109
-rw-r--r-- src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c 121
-rw-r--r-- src/3rdparty/libwebp/src/dsp/lossless_sse2.c 1
-rw-r--r-- src/3rdparty/libwebp/src/dsp/lossless_sse41.c 132
-rw-r--r-- src/3rdparty/libwebp/src/dsp/rescaler.c 11
-rw-r--r-- src/3rdparty/libwebp/src/dsp/ssim.c 2
-rw-r--r-- src/3rdparty/libwebp/src/dsp/upsampling.c 10
-rw-r--r-- src/3rdparty/libwebp/src/dsp/yuv.c 20
21 files changed, 440 insertions, 239 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 3a27990..1892929 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
}
}
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
#undef MFIX
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
-void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
//------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
}
}
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+ const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
}
#if !WEBP_NEON_OMIT_C_CODE
-static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int height,
- uint8_t* dst, int dst_stride) {
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
-static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xff);
}
-static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
@@ -372,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
}
#ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
- const uint8_t* b, int len, uint32_t* out) {
+static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
+ const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -381,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
}
#endif
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out) {
+static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -392,16 +401,22 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
-int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
-int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint8_t* WEBP_RESTRICT, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint32_t* WEBP_RESTRICT, int);
+int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint8_t* WEBP_RESTRICT, int);
+void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
@@ -438,10 +453,10 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitAlphaProcessingSSE41();
}
@@ -455,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
index 9d55421..9e0ace9 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
//------------------------------------------------------------------------------
-static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xffffffffu);
}
-static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
uint8x8x4_t greens; // leave A/R/B channels zero'd.
greens.val[0] = vdup_n_u8(0);
@@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xffffffffu);
}
-static void ExtractGreen_NEON(const uint32_t* argb,
- uint8_t* alpha, int size) {
+static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i + 16 <= size; i += 16) {
const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index f6c6e0f..a5f8c9f 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -18,9 +18,9 @@
//------------------------------------------------------------------------------
-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -317,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
-static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x = 0;
if (!inverse) {
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
index 56040f9..cdf877c 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
+ int argb_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
index cc681cd..460ec4f 100644
--- a/src/3rdparty/libwebp/src/dsp/cost.c
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -395,12 +395,12 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
VP8EncDspCostInitMIPSdspR2();
}
#endif
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspCostInitSSE2();
}
#endif
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8EncDspCostInitNEON();
}
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index 4ca90d8..3145e19 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -189,17 +189,17 @@ VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
// Use compile flags as an indicator of SIMD support instead of a runtime check.
static int wasmCPUInfo(CPUFeature feature) {
switch (feature) {
-#ifdef WEBP_USE_SSE2
+#ifdef WEBP_HAVE_SSE2
case kSSE2:
return 1;
#endif
-#ifdef WEBP_USE_SSE41
+#ifdef WEBP_HAVE_SSE41
case kSSE3:
case kSlowSSSE3:
case kSSE4_1:
return 1;
#endif
-#ifdef WEBP_USE_NEON
+#ifdef WEBP_HAVE_NEON
case kNEON:
return 1;
#endif
@@ -209,9 +209,10 @@ static int wasmCPUInfo(CPUFeature feature) {
return 0;
}
VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
-#elif defined(WEBP_USE_NEON)
-// define a dummy function to enable turning off NEON at runtime by setting
-// VP8DecGetCPUInfo = NULL
+#elif defined(WEBP_HAVE_NEON)
+// In most cases this function doesn't check for NEON support (it's assumed by
+// the configuration), but enables turning off NEON at runtime, for testing
+// purposes, by setting VP8GetCPUInfo = NULL.
static int armCPUInfo(CPUFeature feature) {
if (feature != kNEON) return 0;
#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index 1119842..537c701 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8DspInitSSE41();
}
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 6df48cf..35085e0 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -27,6 +27,23 @@ extern "C" {
#define BPS 32 // this is the common stride for enc/dec
//------------------------------------------------------------------------------
+// WEBP_RESTRICT
+
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
+#if defined(__GNUC__)
+#define WEBP_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define WEBP_RESTRICT __restrict
+#else
+#define WEBP_RESTRICT
+#endif
+
+//------------------------------------------------------------------------------
// CPU detection
#if defined(__GNUC__)
@@ -67,21 +84,31 @@ extern "C" {
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
// set so should succeed on one of the earlier tests.
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
#define WEBP_USE_SSE2
#endif
-#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
#define WEBP_USE_SSE41
#endif
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
#undef WEBP_MSC_SSE41
#undef WEBP_MSC_SSE2
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
- defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
@@ -97,6 +124,10 @@ extern "C" {
#define WEBP_USE_INTRINSICS
#endif
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
@@ -116,7 +147,7 @@ extern "C" {
#define WEBP_DSP_OMIT_C_CODE 1
#endif
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
@@ -578,26 +609,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
// Returns true if alpha[] plane has non-trivial values different from 0xff.
-extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride);
+extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride);
// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
-extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride);
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride);
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values.
-extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride);
+extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
+ int argb_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride);
// Extract the green values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlphaToGreen).
-extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size);
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
@@ -610,29 +644,35 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse);
// Same for a row of single values, with side alpha values.
-extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
// Same as WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+ const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
- const uint8_t* g, const uint8_t* b, int len,
- uint32_t* out);
+extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
+ const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, uint32_t* WEBP_RESTRICT out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index 2fddbc4..ea47a3f 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -773,10 +773,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8EncDspInitSSE41();
}
@@ -800,7 +800,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
index 9e910d9..4506567 100644
--- a/src/3rdparty/libwebp/src/dsp/filters.c
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8FiltersInitSSE2();
}
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
index 4b3f2d0..5c33ec1 100644
--- a/src/3rdparty/libwebp/src/dsp/filters_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
@@ -320,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+#if defined(CHROMIUM)
+ // TODO(crbug.com/654974)
+ (void)VerticalUnfilter_SSE2;
+#else
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index 46b220e..d8bbb02 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -575,6 +575,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitSSE41(void);
extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
@@ -621,9 +622,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8LDspInitSSE41();
+ }
+#endif
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
@@ -638,7 +644,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
index a0c7ab9..c3e8537 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -329,6 +329,15 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+ // use clz if available
+ const int log_cnt = BitsLog2Floor(v) - 7;
+ const uint32_t y = 1 << log_cnt;
+ int correction = 0;
+ const float v_f = (float)v;
+ const uint32_t orig_v = v;
+ v >>= log_cnt;
+#else
int log_cnt = 0;
uint32_t y = 1;
int correction = 0;
@@ -339,6 +348,7 @@ static float FastSLog2Slow_C(uint32_t v) {
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
@@ -355,6 +365,14 @@ static float FastSLog2Slow_C(uint32_t v) {
static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+ // use clz if available
+ const int log_cnt = BitsLog2Floor(v) - 7;
+ const uint32_t y = 1 << log_cnt;
+ const uint32_t orig_v = v;
+ double log_2;
+ v >>= log_cnt;
+#else
int log_cnt = 0;
uint32_t y = 1;
const uint32_t orig_v = v;
@@ -364,6 +382,7 @@ static float FastLog2Slow_C(uint32_t v) {
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
log_2 = kLog2Table[v] + log_cnt;
if (orig_v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
@@ -843,10 +862,10 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LEncDspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8LEncDspInitSSE41();
}
@@ -870,7 +889,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LEncDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 90c2637..b2f83b8 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -232,79 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
//------------------------------------------------------------------------------
// Entropy
-// Checks whether the X or Y contribution is worth computing and adding.
-// Used in loop unrolling.
-#define ANALYZE_X_OR_Y(x_or_y, j) \
- do { \
- if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
- } while (0)
-
-// Checks whether the X + Y contribution is worth computing and adding.
-// Used in loop unrolling.
-#define ANALYZE_XY(j) \
- do { \
- if (tmp[j] != 0) { \
- retval -= VP8LFastSLog2(tmp[j]); \
- ANALYZE_X_OR_Y(X, j); \
- } \
- } while (0)
-
-#if !(defined(__i386__) || defined(_M_IX86))
+// TODO(https://crbug.com/webp/499): this function produces different results
+// from the C code due to use of double/float resulting in output differences
+// when compared to -noasm.
+#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
+
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
- int sumX, sumXY;
- int32_t tmp[4];
- __m128i zero = _mm_setzero_si128();
- // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
- __m128i sumXY_128 = zero;
- __m128i sumX_128 = zero;
-
- for (i = 0; i < 256; i += 4) {
- const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
- const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
-
- // Check if any X is non-zero: this actually provides a speedup as X is
- // usually sparse.
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
- const __m128i xy_128 = _mm_add_epi32(x, y);
- sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
-
- sumX_128 = _mm_add_epi32(sumX_128, x);
-
- // Analyze the different X + Y.
- _mm_storeu_si128((__m128i*)tmp, xy_128);
-
- ANALYZE_XY(0);
- ANALYZE_XY(1);
- ANALYZE_XY(2);
- ANALYZE_XY(3);
- } else {
- // X is fully 0, so only deal with Y.
- sumXY_128 = _mm_add_epi32(sumXY_128, y);
-
- ANALYZE_X_OR_Y(Y, 0);
- ANALYZE_X_OR_Y(Y, 1);
- ANALYZE_X_OR_Y(Y, 2);
- ANALYZE_X_OR_Y(Y, 3);
+ int sumX = 0, sumXY = 0;
+ const __m128i zero = _mm_setzero_si128();
+
+ for (i = 0; i < 256; i += 16) {
+ const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
+ const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
+ const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
+ const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
+ const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
+ const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
+ const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
+ const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
+ const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
+ _mm_packs_epi32(x2, x3));
+ const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
+ _mm_packs_epi32(y2, y3));
+ const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
+ int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
+ while (my) {
+ const int32_t j = BitsCtz(my);
+ int xy;
+ if ((mx >> j) & 1) {
+ const int x = X[i + j];
+ sumXY += x;
+ retval -= VP8LFastSLog2(x);
+ }
+ xy = X[i + j] + Y[i + j];
+ sumX += xy;
+ retval -= VP8LFastSLog2(xy);
+ my &= my - 1;
}
}
-
- // Sum up sumX_128 to get sumX.
- _mm_storeu_si128((__m128i*)tmp, sumX_128);
- sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-
- // Sum up sumXY_128 to get sumXY.
- _mm_storeu_si128((__m128i*)tmp, sumXY_128);
- sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
return (float)retval;
}
-#endif // !(defined(__i386__) || defined(_M_IX86))
-#undef ANALYZE_X_OR_Y
-#undef ANALYZE_XY
+#else
+
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
+
+#endif
//------------------------------------------------------------------------------
@@ -662,10 +638,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
VP8LAddVector = AddVector_SSE2;
VP8LAddVectorEq = AddVectorEq_SSE2;
- // TODO(https://crbug.com/webp/499): this function produces different results
- // from the C code due to use of double/float resulting in output differences
- // when compared to -noasm.
-#if !(defined(__i386__) || defined(_M_IX86))
+#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
#endif
VP8LVectorMismatch = VectorMismatch_SSE2;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 719d8ed..ad358a6 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -44,46 +44,47 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
-#define SPAN 8
+#define MK_CST_16(HI, LO) \
+ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
- const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
- const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
- const __m128i mask_g = _mm_set1_epi16((short)0xff00); // green mask
- const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask
- const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask
- const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
- -1, -1, -1, -1, -1, -1, -1);
- const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
- 2, -1, 6, -1, 10, -1, 14);
- int y;
- for (y = 0; y < tile_height; ++y) {
- const uint32_t* const src = argb + y * stride;
- int i, x;
- for (x = 0; x + SPAN <= tile_width; x += SPAN) {
- uint16_t values[SPAN];
- const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
- const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
- const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
- const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
- const __m128i r = _mm_or_si128(r0, r1); // r 0
- const __m128i gb0 = _mm_and_si128(in0, mask_gb);
- const __m128i gb1 = _mm_and_si128(in1, mask_gb);
- const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b
- const __m128i g = _mm_and_si128(gb, mask_g); // g 0
- const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr
- const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg
- const __m128i C = _mm_sub_epi8(gb, B); // x b'
- const __m128i D = _mm_sub_epi8(C, A); // x b''
- const __m128i E = _mm_and_si128(D, mask_b); // 0 b''
- _mm_storeu_si128((__m128i*)values, E);
- for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ const __m128i mult =
+ MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
+ const __m128i perm =
+ _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
+ if (tile_width >= 4) {
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+ const __m128i B1 = _mm_shuffle_epi8(A1, perm);
+ const __m128i C1 = _mm_mulhi_epi16(B1, mult);
+ const __m128i D1 = _mm_sub_epi16(A1, C1);
+ __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
+ int x;
+ for (x = 4; x + 4 <= tile_width; x += 4) {
+ const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+ __m128i B2, C2, D2;
+ ++histo[_mm_extract_epi8(E, 0)];
+ B2 = _mm_shuffle_epi8(A2, perm);
+ ++histo[_mm_extract_epi8(E, 4)];
+ C2 = _mm_mulhi_epi16(B2, mult);
+ ++histo[_mm_extract_epi8(E, 8)];
+ D2 = _mm_sub_epi16(A2, C2);
+ ++histo[_mm_extract_epi8(E, 12)];
+ E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
+ }
+ ++histo[_mm_extract_epi8(E, 0)];
+ ++histo[_mm_extract_epi8(E, 4)];
+ ++histo[_mm_extract_epi8(E, 8)];
+ ++histo[_mm_extract_epi8(E, 12)];
}
}
{
- const int left_over = tile_width & (SPAN - 1);
+ const int left_over = tile_width & 3;
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
@@ -95,33 +96,37 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
- const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
- const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
- const __m128i mask = _mm_set1_epi16(0xff);
-
- int y;
- for (y = 0; y < tile_height; ++y) {
- const uint32_t* const src = argb + y * stride;
- int i, x;
- for (x = 0; x + SPAN <= tile_width; x += SPAN) {
- uint16_t values[SPAN];
- const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
- const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
- const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
- const __m128i g1 = _mm_and_si128(in1, mask_g);
- const __m128i g = _mm_packus_epi32(g0, g1); // g 0
- const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
- const __m128i A1 = _mm_srli_epi32(in1, 16);
- const __m128i A = _mm_packus_epi32(A0, A1); // x r
- const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr
- const __m128i C = _mm_sub_epi8(A, B); // x r'
- const __m128i D = _mm_and_si128(C, mask); // 0 r'
- _mm_storeu_si128((__m128i*)values, D);
- for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+
+ const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
+ const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
+ if (tile_width >= 4) {
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+ const __m128i B1 = _mm_and_si128(A1, mask_g);
+ const __m128i C1 = _mm_madd_epi16(B1, mult);
+ __m128i D = _mm_sub_epi16(A1, C1);
+ int x;
+ for (x = 4; x + 4 <= tile_width; x += 4) {
+ const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+ __m128i B2, C2;
+ ++histo[_mm_extract_epi8(D, 2)];
+ B2 = _mm_and_si128(A2, mask_g);
+ ++histo[_mm_extract_epi8(D, 6)];
+ C2 = _mm_madd_epi16(B2, mult);
+ ++histo[_mm_extract_epi8(D, 10)];
+ ++histo[_mm_extract_epi8(D, 14)];
+ D = _mm_sub_epi16(A2, C2);
+ }
+ ++histo[_mm_extract_epi8(D, 2)];
+ ++histo[_mm_extract_epi8(D, 6)];
+ ++histo[_mm_extract_epi8(D, 10)];
+ ++histo[_mm_extract_epi8(D, 14)];
}
}
{
- const int left_over = tile_width & (SPAN - 1);
+ const int left_over = tile_width & 3;
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height, green_to_red,
@@ -130,6 +135,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
}
}
+#undef MK_CST_16
+
//------------------------------------------------------------------------------
// Entry point
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index aef0cee..3a0eb44 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -18,7 +18,6 @@
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
-#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_sse41.c
new file mode 100644
index 0000000..b0d6daa
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse41.c
@@ -0,0 +1,132 @@
+// Copyright 2021 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 variant of methods for lossless decoder
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "src/dsp/common_sse41.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
+ const uint32_t* const src,
+ int num_pixels, uint32_t* dst) {
+// SSE4.1 TransformColorInverse: 4 pixels per loop iteration, C fallback for the rest. CST(X): sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
+ const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
+ (CST(green_to_blue_) & 0xffff)); // per 32-bit lane: green_to_red_ in the high 16 bits, green_to_blue_ in the low 16 bits
+ const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_)); // only the low 16-bit half matters: perm2 zeroes the high half of its output
+#undef CST
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // selects bytes 1 and 3 of each pixel for the final blend
+ const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
+ -1, 9, -1, 9, -1, 13, -1, 13); // byte 1 of each pixel -> high byte of both 16-bit halves, low bytes zeroed
+ const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
+ -1, 10, -1, -1, -1, 14, -1, -1); // byte 2 of each pixel -> high byte of the low 16-bit half, everything else zeroed
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
+ const __m128i C = _mm_mulhi_epi16(B, mults_rb); // high 16 bits of each signed 16-bit product
+ const __m128i D = _mm_add_epi8(A, C); // byte-wise add; bytes 1 and 3 pick up unwanted bits, discarded by the blend below
+ const __m128i E = _mm_shuffle_epi8(D, perm2); // takes byte 2 of D, i.e. after the first correction
+ const __m128i F = _mm_mulhi_epi16(E, mults_b2);
+ const __m128i G = _mm_add_epi8(D, F); // second correction lands in byte 0
+ const __m128i out = _mm_blendv_epi8(G, A, mask_ag); // bytes 1 and 3 from the input A, bytes 0 and 2 from G
+ _mm_storeu_si128((__m128i*)&dst[i], out);
+ }
+ // Fall-back to C-version for left-overs.
+ if (i != num_pixels) {
+ VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Shared loop: while num_pixels >= 16, 4 vector loads -> 3 vector stores. Expects in, out, num_pixels and perm0..perm3 in the caller's scope, and updates in/out/num_pixels.
+#define ARGB_TO_RGB_SSE41 do { \
+ while (num_pixels >= 16) { \
+ const __m128i in0 = _mm_loadu_si128(in + 0); \
+ const __m128i in1 = _mm_loadu_si128(in + 1); \
+ const __m128i in2 = _mm_loadu_si128(in + 2); \
+ const __m128i in3 = _mm_loadu_si128(in + 3); \
+ const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \
+ const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \
+ const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \
+ const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \
+ const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
+ const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
+ const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
+ _mm_storeu_si128(out + 0, b0); \
+ _mm_storeu_si128(out + 1, b1); \
+ _mm_storeu_si128(out + 2, b2); \
+ in += 4; \
+ out += 3; \
+ num_pixels -= 16; \
+ } \
+} while (0)
+
+static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+ uint8_t* dst) { // writes 3 bytes to dst for every 32-bit pixel read from src
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
+ 8, 14, 13, 12, -1, -1, -1, -1); // per pixel: emit bytes 2,1,0 and drop byte 3; last 4 output bytes zeroed
+ const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); // perm0 with its 32-bit lanes rotated by one position
+ const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); // rotated by two positions
+ const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); // rotated by three positions
+
+ ARGB_TO_RGB_SSE41; // consumes 16 pixels per iteration, advancing in/out and decrementing num_pixels
+
+ // left-overs: fewer than 16 pixels remain, handed to the C version
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
+static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+ int num_pixels, uint8_t* dst) { // writes 3 bytes to dst for every 32-bit pixel read from src
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
+ 12, 13, 14, -1, -1, -1, -1); // per pixel: keep bytes 0,1,2 in order and drop byte 3; last 4 output bytes zeroed
+ const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); // perm0 with its 32-bit lanes rotated by one position
+ const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); // rotated by two positions
+ const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); // rotated by three positions
+
+ ARGB_TO_RGB_SSE41; // consumes 16 pixels per iteration, advancing in/out and decrementing num_pixels
+
+ // left-overs: fewer than 16 pixels remain, handed to the C version
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
+#undef ARGB_TO_RGB_SSE41
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) { // points three VP8L function pointers at the SSE4.1 versions defined in this file
+ VP8LTransformColorInverse = TransformColorInverse_SSE41;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
index c5a01e8..14620ce 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
int x_out = channel;
// simple bilinear interpolation
int accum = wrk->x_add;
- int left = src[x_in];
- int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+ rescaler_t left = (rescaler_t)src[x_in];
+ rescaler_t right =
+ (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
x_in += x_stride;
while (1) {
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
left = right;
x_in += x_stride;
assert(x_in < wrk->src_width * x_stride);
- right = src[x_in];
+ right = (rescaler_t)src[x_in];
accum += wrk->x_add;
}
}
@@ -213,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPRescalerDspInitSSE2();
}
@@ -235,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/ssim.c b/src/3rdparty/libwebp/src/dsp/ssim.c
index 989ce82..f85c2e6 100644
--- a/src/3rdparty/libwebp/src/dsp/ssim.c
+++ b/src/3rdparty/libwebp/src/dsp/ssim.c
@@ -150,7 +150,7 @@ WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 9b60da5..87f771f 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitYUV444ConvertersSSE2();
}
#endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitUpsamplersSSE2();
}
#endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index 14e67fc..48466f8 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitSamplersSSE2();
}
-#endif // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
-#endif // WEBP_USE_SSE41
+#endif // WEBP_HAVE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
@@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
-#endif // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
-#endif // WEBP_USE_SSE41
+#endif // WEBP_HAVE_SSE41
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
-#endif // WEBP_USE_NEON
+#endif // WEBP_HAVE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);