summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/libwebp/src/dsp
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp')
-rw-r--r--src/3rdparty/libwebp/src/dsp/alpha_processing.c114
-rw-r--r--src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c55
-rw-r--r--src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c4
-rw-r--r--src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c88
-rw-r--r--src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c10
-rw-r--r--src/3rdparty/libwebp/src/dsp/argb.c68
-rw-r--r--src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c110
-rw-r--r--src/3rdparty/libwebp/src/dsp/argb_sse2.c67
-rw-r--r--src/3rdparty/libwebp/src/dsp/cost.c14
-rw-r--r--src/3rdparty/libwebp/src/dsp/cost_mips32.c14
-rw-r--r--src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c8
-rw-r--r--src/3rdparty/libwebp/src/dsp/cost_sse2.c18
-rw-r--r--src/3rdparty/libwebp/src/dsp/cpu.c4
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec.c401
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_clip_tables.c13
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_mips32.c4
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c4
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_msa.c15
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_neon.c727
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_sse2.c464
-rw-r--r--src/3rdparty/libwebp/src/dsp/dec_sse41.c10
-rw-r--r--src/3rdparty/libwebp/src/dsp/dsp.h96
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc.c292
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_avx2.c2
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_mips32.c77
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c99
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_msa.c122
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_neon.c198
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_sse2.c477
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_sse41.c66
-rw-r--r--src/3rdparty/libwebp/src/dsp/filters.c139
-rw-r--r--src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c97
-rw-r--r--src/3rdparty/libwebp/src/dsp/filters_msa.c22
-rw-r--r--src/3rdparty/libwebp/src/dsp/filters_neon.c18
-rw-r--r--src/3rdparty/libwebp/src/dsp/filters_sse2.c115
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless.c199
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless.h12
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_common.h12
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc.c134
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c59
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c40
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c17
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c29
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c93
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c9
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c109
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_msa.c41
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_neon.c63
-rw-r--r--src/3rdparty/libwebp/src/dsp/lossless_sse2.c271
-rw-r--r--src/3rdparty/libwebp/src/dsp/msa_macro.h2
-rw-r--r--src/3rdparty/libwebp/src/dsp/neon.h11
-rw-r--r--src/3rdparty/libwebp/src/dsp/rescaler.c43
-rw-r--r--src/3rdparty/libwebp/src/dsp/rescaler_mips32.c24
-rw-r--r--src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c14
-rw-r--r--src/3rdparty/libwebp/src/dsp/rescaler_msa.c16
-rw-r--r--src/3rdparty/libwebp/src/dsp/rescaler_neon.c26
-rw-r--r--src/3rdparty/libwebp/src/dsp/rescaler_sse2.c92
-rw-r--r--src/3rdparty/libwebp/src/dsp/ssim.c166
-rw-r--r--src/3rdparty/libwebp/src/dsp/ssim_sse2.c165
-rw-r--r--src/3rdparty/libwebp/src/dsp/upsampling.c161
-rw-r--r--src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c47
-rw-r--r--src/3rdparty/libwebp/src/dsp/upsampling_msa.c50
-rw-r--r--src/3rdparty/libwebp/src/dsp/upsampling_neon.c52
-rw-r--r--src/3rdparty/libwebp/src/dsp/upsampling_sse2.c116
-rw-r--r--src/3rdparty/libwebp/src/dsp/yuv.c98
-rw-r--r--src/3rdparty/libwebp/src/dsp/yuv.h75
-rw-r--r--src/3rdparty/libwebp/src/dsp/yuv_mips32.c20
-rw-r--r--src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c20
-rw-r--r--src/3rdparty/libwebp/src/dsp/yuv_neon.c288
-rw-r--r--src/3rdparty/libwebp/src/dsp/yuv_sse2.c345
70 files changed, 3829 insertions, 3122 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 4b60e09..590e3bc 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -12,10 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
// Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
+#if !defined(USE_TABLES_FOR_ALPHA_MULT)
+#define USE_TABLES_FOR_ALPHA_MULT 0 // ALTERNATE_CODE
+#endif
+
// -----------------------------------------------------------------------------
@@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
return v;
}
-#ifdef USE_TABLES_FOR_ALPHA_MULT
+#if (USE_TABLES_FOR_ALPHA_MULT == 1)
static const uint32_t kMultTables[2][256] = {
{ // (255u << MFIX) / alpha
@@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return inverse ? (255u << MFIX) / a : a * KINV_255;
}
-#endif // USE_TABLES_FOR_ALPHA_MULT
+#endif // USE_TABLES_FOR_ALPHA_MULT
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
@@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
}
}
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
- int width, int inverse) {
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
@@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
#endif
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
- int w, int h, int stride) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
+ int w, int h, int stride) {
while (h-- > 0) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
rgba += stride;
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
#undef MULTIPLIER
#undef PREMULTIPLY
@@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
return (x * m) >> 16;
}
-static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
- int w, int h, int stride,
- int rg_byte_pos /* 0 or 1 */) {
+static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
+ int w, int h, int stride,
+ int rg_byte_pos /* 0 or 1 */) {
while (h-- > 0) {
int i;
for (i = 0; i < w; ++i) {
@@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
}
#undef MULTIPLIER
-static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
- int w, int h, int stride) {
-#ifdef WEBP_SWAP_16BIT_CSP
- ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
+ int w, int h, int stride) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
#else
- ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+ ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
#endif
}
+#if !WEBP_NEON_OMIT_C_CODE
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
@@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+
+static int HasAlpha8b_C(const uint8_t* src, int length) {
+ while (length-- > 0) if (*src++ != 0xff) return 1;
+ return 0;
+}
+
+static int HasAlpha32b_C(const uint8_t* src, int length) {
+ int x;
+ for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
+ return 0;
+}
+
+//------------------------------------------------------------------------------
+// Simple channel manipulations.
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+ return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out) {
+ int i, offset = 0;
+ for (i = 0; i < len; ++i) {
+ out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+ offset += step;
+ }
+}
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out);
+
+int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+int (*WebPHasAlpha32b)(const uint8_t* src, int length);
//------------------------------------------------------------------------------
// Init function
@@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
- WebPMultARGBRow = WebPMultARGBRowC;
- WebPMultRow = WebPMultRowC;
- WebPApplyAlphaMultiply = ApplyAlphaMultiply;
- WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+ WebPMultARGBRow = WebPMultARGBRow_C;
+ WebPMultRow = WebPMultRow_C;
+ WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
+ WebPPackRGB = PackRGB_C;
+#if !WEBP_NEON_OMIT_C_CODE
+ WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
WebPDispatchAlpha = DispatchAlpha_C;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
WebPExtractAlpha = ExtractAlpha_C;
WebPExtractGreen = ExtractGreen_C;
+#endif
+
+ WebPHasAlpha8b = HasAlpha8b_C;
+ WebPHasAlpha32b = HasAlpha32b_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
#endif
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- WebPInitAlphaProcessingNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitAlphaProcessingMIPSdspR2();
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ WebPInitAlphaProcessingNEON();
+ }
+#endif
+
+ assert(WebPMultARGBRow != NULL);
+ assert(WebPMultRow != NULL);
+ assert(WebPApplyAlphaMultiply != NULL);
+ assert(WebPApplyAlphaMultiply4444 != NULL);
+ assert(WebPDispatchAlpha != NULL);
+ assert(WebPDispatchAlphaToGreen != NULL);
+ assert(WebPExtractAlpha != NULL);
+ assert(WebPExtractGreen != NULL);
+ assert(WebPPackRGB != NULL);
+ assert(WebPHasAlpha8b != NULL);
+ assert(WebPHasAlpha32b != NULL);
+
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
index c631d78..e0dc91b 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@@ -12,13 +12,13 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffff;
int i, j, temp0;
@@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+ int inverse) {
int x;
const uint32_t c_00ffffff = 0x00ffffffu;
const uint32_t c_ff000000 = 0xff000000u;
@@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
}
}
+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+ const uint8_t* b, int len, int step,
+ uint32_t* out) {
+ int temp0, temp1, temp2, offset;
+ const int rest = len & 1;
+ const int a = 0xff;
+ const uint32_t* const loop_end = out + len - rest;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
- WebPDispatchAlpha = DispatchAlpha;
- WebPMultARGBRow = MultARGBRow;
+ WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+ WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+ WebPPackRGB = PackRGB_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
index 606a401..9d55421 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -11,11 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
-#include "./neon.h"
+#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 83dc559..7658700 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -11,16 +11,16 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
#undef MULTIPLIER
#undef PREMULTIPLY
+//------------------------------------------------------------------------------
+// Alpha detection
+
+static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
+ const __m128i all_0xff = _mm_set1_epi8(0xff);
+ int i = 0;
+ for (; i + 16 <= length; i += 16) {
+ const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
+ const int mask = _mm_movemask_epi8(bits);
+ if (mask != 0xffff) return 1;
+ }
+ for (; i < length; ++i) if (src[i] != 0xff) return 1;
+ return 0;
+}
+
+static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
+ const __m128i alpha_mask = _mm_set1_epi32(0xff);
+ const __m128i all_0xff = _mm_set1_epi8(0xff);
+ int i = 0;
+ // We don't know if we can access the last 3 bytes after the last alpha
+ // value 'src[4 * length - 4]' (because we don't know if alpha is the first
+ // or the last byte of the quadruplet). Hence the '-3' protection below.
+ length = length * 4 - 3; // size in bytes
+ for (; i + 64 <= length; i += 64) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
+ const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+ const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+ const __m128i b2 = _mm_and_si128(a2, alpha_mask);
+ const __m128i b3 = _mm_and_si128(a3, alpha_mask);
+ const __m128i c0 = _mm_packs_epi32(b0, b1);
+ const __m128i c1 = _mm_packs_epi32(b2, b3);
+ const __m128i d = _mm_packus_epi16(c0, c1);
+ const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+ const int mask = _mm_movemask_epi8(bits);
+ if (mask != 0xffff) return 1;
+ }
+ for (; i + 32 <= length; i += 32) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+ const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+ const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+ const __m128i c = _mm_packs_epi32(b0, b1);
+ const __m128i d = _mm_packus_epi16(c, c);
+ const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+ const int mask = _mm_movemask_epi8(bits);
+ if (mask != 0xffff) return 1;
+ }
+ for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
+ return 0;
+}
+
// -----------------------------------------------------------------------------
// Apply alpha value to rows
@@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
}
}
width -= x;
- if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+ if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
}
}
width -= x;
- if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+ if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}
//------------------------------------------------------------------------------
@@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPMultARGBRow = MultARGBRow_SSE2;
WebPMultRow = MultRow_SSE2;
WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
- WebPDispatchAlpha = DispatchAlpha;
- WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
- WebPExtractAlpha = ExtractAlpha;
+ WebPDispatchAlpha = DispatchAlpha_SSE2;
+ WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
+ WebPExtractAlpha = ExtractAlpha_SSE2;
+
+ WebPHasAlpha8b = HasAlpha8b_SSE2;
+ WebPHasAlpha32b = HasAlpha32b_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
index 986fde9..56040f9 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
@@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
extern void WebPInitAlphaProcessingSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
- WebPExtractAlpha = ExtractAlpha;
+ WebPExtractAlpha = ExtractAlpha_SSE41;
}
#else // !WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/argb.c b/src/3rdparty/libwebp/src/dsp/argb.c
deleted file mode 100644
index cc1f9a9..0000000
--- a/src/3rdparty/libwebp/src/dsp/argb.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// ARGB making functions.
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
- return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
- const uint8_t* b, int len, uint32_t* out) {
- int i;
- for (i = 0; i < len; ++i) {
- out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
- }
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out) {
- int i, offset = 0;
- for (i = 0; i < len; ++i) {
- out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
- offset += step;
- }
-}
-
-void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
- const uint8_t*, int, uint32_t*);
-void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
- int, int, uint32_t*);
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-extern void VP8EncDspARGBInitSSE2(void);
-
-static volatile VP8CPUInfo argb_last_cpuinfo_used =
- (VP8CPUInfo)&argb_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
- if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
-
- VP8PackARGB = PackARGB;
- VP8PackRGB = PackRGB;
-
- // If defined, use CPUInfo() to overwrite some pointers with faster versions.
- if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
- if (VP8GetCPUInfo(kSSE2)) {
- VP8EncDspARGBInitSSE2();
- }
-#endif
-#if defined(WEBP_USE_MIPS_DSP_R2)
- if (VP8GetCPUInfo(kMIPSdspR2)) {
- VP8EncDspARGBInitMIPSdspR2();
- }
-#endif
- }
- argb_last_cpuinfo_used = VP8GetCPUInfo;
-}
diff --git a/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c
deleted file mode 100644
index af65acb..0000000
--- a/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// ARGB making functions (mips version).
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_MIPS_DSP_R2)
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
- const uint8_t* b, int len, uint32_t* out) {
- int temp0, temp1, temp2, temp3, offset;
- const int rest = len & 1;
- const uint32_t* const loop_end = out + len - rest;
- const int step = 4;
- __asm__ volatile (
- "xor %[offset], %[offset], %[offset] \n\t"
- "beq %[loop_end], %[out], 0f \n\t"
- "2: \n\t"
- "lbux %[temp0], %[offset](%[a]) \n\t"
- "lbux %[temp1], %[offset](%[r]) \n\t"
- "lbux %[temp2], %[offset](%[g]) \n\t"
- "lbux %[temp3], %[offset](%[b]) \n\t"
- "ins %[temp1], %[temp0], 16, 16 \n\t"
- "ins %[temp3], %[temp2], 16, 16 \n\t"
- "addiu %[out], %[out], 4 \n\t"
- "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
- "sw %[temp0], -4(%[out]) \n\t"
- "addu %[offset], %[offset], %[step] \n\t"
- "bne %[loop_end], %[out], 2b \n\t"
- "0: \n\t"
- "beq %[rest], $zero, 1f \n\t"
- "lbux %[temp0], %[offset](%[a]) \n\t"
- "lbux %[temp1], %[offset](%[r]) \n\t"
- "lbux %[temp2], %[offset](%[g]) \n\t"
- "lbux %[temp3], %[offset](%[b]) \n\t"
- "ins %[temp1], %[temp0], 16, 16 \n\t"
- "ins %[temp3], %[temp2], 16, 16 \n\t"
- "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
- "sw %[temp0], 0(%[out]) \n\t"
- "1: \n\t"
- : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
- [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
- : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
- [loop_end]"r"(loop_end), [rest]"r"(rest)
- : "memory"
- );
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out) {
- int temp0, temp1, temp2, offset;
- const int rest = len & 1;
- const int a = 0xff;
- const uint32_t* const loop_end = out + len - rest;
- __asm__ volatile (
- "xor %[offset], %[offset], %[offset] \n\t"
- "beq %[loop_end], %[out], 0f \n\t"
- "2: \n\t"
- "lbux %[temp0], %[offset](%[r]) \n\t"
- "lbux %[temp1], %[offset](%[g]) \n\t"
- "lbux %[temp2], %[offset](%[b]) \n\t"
- "ins %[temp0], %[a], 16, 16 \n\t"
- "ins %[temp2], %[temp1], 16, 16 \n\t"
- "addiu %[out], %[out], 4 \n\t"
- "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
- "sw %[temp0], -4(%[out]) \n\t"
- "addu %[offset], %[offset], %[step] \n\t"
- "bne %[loop_end], %[out], 2b \n\t"
- "0: \n\t"
- "beq %[rest], $zero, 1f \n\t"
- "lbux %[temp0], %[offset](%[r]) \n\t"
- "lbux %[temp1], %[offset](%[g]) \n\t"
- "lbux %[temp2], %[offset](%[b]) \n\t"
- "ins %[temp0], %[a], 16, 16 \n\t"
- "ins %[temp2], %[temp1], 16, 16 \n\t"
- "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
- "sw %[temp0], 0(%[out]) \n\t"
- "1: \n\t"
- : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
- [offset]"=&r"(offset), [out]"+&r"(out)
- : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
- [loop_end]"r"(loop_end), [rest]"r"(rest)
- : "memory"
- );
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
- VP8PackARGB = PackARGB;
- VP8PackRGB = PackRGB;
-}
-
-#else // !WEBP_USE_MIPS_DSP_R2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
-
-#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/argb_sse2.c b/src/3rdparty/libwebp/src/dsp/argb_sse2.c
deleted file mode 100644
index afcb195..0000000
--- a/src/3rdparty/libwebp/src/dsp/argb_sse2.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// ARGB making functions (SSE2 version).
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_SSE2)
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <string.h>
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
- return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
- const uint8_t* b, int len, uint32_t* out) {
- if (g == r + 1) { // RGBA input order. Need to swap R and B.
- int i = 0;
- const int len_max = len & ~3; // max length processed in main loop
- const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
- assert(b == r + 2);
- assert(a == r + 3);
- for (; i < len_max; i += 4) {
- const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
- const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
- const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
- const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i F = _mm_or_si128(E, C);
- _mm_storeu_si128((__m128i*)(out + i), F);
- }
- for (; i < len; ++i) {
- out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
- }
- } else {
- assert(g == b + 1);
- assert(r == b + 2);
- assert(a == b + 3);
- memcpy(out, b, len * 4);
- }
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
- VP8PackARGB = PackARGB;
-}
-
-#else // !WEBP_USE_SSE2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
-
-#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
index 58ddea7..a732389 100644
--- a/src/3rdparty/libwebp/src/dsp/cost.c
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -9,8 +9,8 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/cost_enc.h"
//------------------------------------------------------------------------------
// Boolean-cost cost table
@@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
//------------------------------------------------------------------------------
// Mode costs
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
@@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
return cost;
}
-static void SetResidualCoeffs(const int16_t* const coeffs,
- VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+ VP8Residual* const res) {
int n;
res->last = -1;
assert(res->first == 0 || coeffs[0] == 0);
@@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
- VP8GetResidualCost = GetResidualCost;
- VP8SetResidualCoeffs = SetResidualCoeffs;
+ VP8GetResidualCost = GetResidualCost_C;
+ VP8SetResidualCoeffs = SetResidualCoeffs_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips32.c b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
index 3102da8..0500f88 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
@@ -9,13 +9,13 @@
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
@@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
return cost;
}
-static void SetResidualCoeffs(const int16_t* const coeffs,
- VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+ VP8Residual* const res) {
const int16_t* p_coeffs = (int16_t*)coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);
@@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
extern void VP8EncDspCostInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
- VP8GetResidualCost = GetResidualCost;
- VP8SetResidualCoeffs = SetResidualCoeffs;
+ VP8GetResidualCost = GetResidualCost_MIPS32;
+ VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
}
#else // !WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
index 6ec8aeb..51248de 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
@@ -9,13 +9,13 @@
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
@@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
extern void VP8EncDspCostInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
- VP8GetResidualCost = GetResidualCost;
+ VP8GetResidualCost = GetResidualCost_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/cost_sse2.c b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
index 421d51f..487a079 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
@@ -11,19 +11,19 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
//------------------------------------------------------------------------------
-static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
- VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+ VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE2 to compare 16 values with a single instruction.
@@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
res->coeffs = coeffs;
}
-static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
@@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
extern void VP8EncDspCostInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
- VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
- VP8GetResidualCost = GetResidualCostSSE2;
+ VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+ VP8GetResidualCost = GetResidualCost_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index b39184e..6f42cbb 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -11,7 +11,7 @@
//
// Author: Christian Duvivier (cduvivier@google.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_HAVE_NEON_RTCD)
#include <stdio.h>
@@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
return !!(cpu_info[2] & (1 << 0));
}
if (feature == kSlowSSSE3) {
- if (is_intel && (cpu_info[2] & (1 << 0))) { // SSSE3?
+ if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
return CheckSlowModel(cpu_info[0]);
}
return 0;
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index 007e985..7e82407 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -11,9 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include <assert.h>
+
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
//------------------------------------------------------------------------------
@@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
// Transforms (Paragraph 14.4)
#define STORE(x, y, v) \
- dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+ dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
#define STORE2(y, dc, d, c) do { \
const int DC = (dc); \
@@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
#define MUL1(a) ((((a) * 20091) >> 16) + (a))
#define MUL2(a) (((a) * 35468) >> 16)
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformOne_C(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
}
// Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
const int c4 = MUL2(in[4]);
const int d4 = MUL1(in[4]);
@@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
#undef MUL2
#undef STORE2
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
- TransformOne(in, dst);
+static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne_C(in, dst);
if (do_two) {
- TransformOne(in + 16, dst + 4);
+ TransformOne_C(in + 16, dst + 4);
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void TransformUV(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* in, uint8_t* dst) {
VP8Transform(in + 0 * 16, dst, 1);
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformDC_C(const int16_t* in, uint8_t* dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
}
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
//------------------------------------------------------------------------------
// Paragraph 14.3
-static void TransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformWHT_C(const int16_t* in, int16_t* out) {
int tmp[16];
int i;
for (i = 0; i < 4; ++i) {
@@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
out += 64;
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
@@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
#define DST(x, y) dst[(x) + (y) * BPS]
+#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const uint8_t* const clip0 = VP8kclip1 - top[-1];
@@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
dst += BPS;
}
}
-static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
+static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); }
//------------------------------------------------------------------------------
// 16x16
-static void VE16(uint8_t* dst) { // vertical
+static void VE16_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 16; ++j) {
memcpy(dst + j * BPS, dst - BPS, 16);
}
}
-static void HE16(uint8_t* dst) { // horizontal
+static void HE16_C(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
memset(dst, dst[-1], 16);
@@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
}
}
-static void DC16(uint8_t* dst) { // DC
+static void DC16_C(uint8_t* dst) { // DC
int DC = 16;
int j;
for (j = 0; j < 16; ++j) {
@@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) { // DC
Put16(DC >> 5, dst);
}
-static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+static void DC16NoTop_C(uint8_t* dst) { // DC with top samples not available
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
@@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
Put16(DC >> 4, dst);
}
-static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+static void DC16NoLeft_C(uint8_t* dst) { // DC with left samples not available
int DC = 8;
int i;
for (i = 0; i < 16; ++i) {
@@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
Put16(DC >> 4, dst);
}
-static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
+static void DC16NoTopLeft_C(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
@@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-static void VE4(uint8_t* dst) { // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE4_C(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
@@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) { // vertical
memcpy(dst + i * BPS, vals, sizeof(vals));
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void HE4(uint8_t* dst) { // horizontal
+static void HE4_C(uint8_t* dst) { // horizontal
const int A = dst[-1 - BPS];
const int B = dst[-1];
const int C = dst[-1 + BPS];
@@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
}
-static void DC4(uint8_t* dst) { // DC
+#if !WEBP_NEON_OMIT_C_CODE
+static void DC4_C(uint8_t* dst) { // DC
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) { // DC
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
}
-static void RD4(uint8_t* dst) { // Down-right
+static void RD4_C(uint8_t* dst) { // Down-right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) { // Down-right
DST(3, 0) = AVG3(D, C, B);
}
-static void LD4(uint8_t* dst) { // Down-Left
+static void LD4_C(uint8_t* dst) { // Down-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) { // Down-Left
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void VR4(uint8_t* dst) { // Vertical-Right
+static void VR4_C(uint8_t* dst) { // Vertical-Right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
DST(3, 1) = AVG3(B, C, D);
}
-static void VL4(uint8_t* dst) { // Vertical-Left
+static void VL4_C(uint8_t* dst) { // Vertical-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
DST(3, 3) = AVG3(F, G, H);
}
-static void HU4(uint8_t* dst) { // Horizontal-Up
+static void HU4_C(uint8_t* dst) { // Horizontal-Up
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) { // Horizontal-Up
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
-static void HD4(uint8_t* dst) { // Horizontal-Down
+static void HD4_C(uint8_t* dst) { // Horizontal-Down
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
//------------------------------------------------------------------------------
// Chroma
-static void VE8uv(uint8_t* dst) { // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE8uv_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 8; ++j) {
memcpy(dst + j * BPS, dst - BPS, 8);
}
}
-static void HE8uv(uint8_t* dst) { // horizontal
+static void HE8uv_C(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
memset(dst, dst[-1], 8);
@@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
}
}
-static void DC8uv(uint8_t* dst) { // DC
+static void DC8uv_C(uint8_t* dst) { // DC
int dc0 = 8;
int i;
for (i = 0; i < 8; ++i) {
@@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) { // DC
Put8x8uv(dc0 >> 4, dst);
}
-static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+static void DC8uvNoLeft_C(uint8_t* dst) { // DC with no left samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
Put8x8uv(dc0 >> 3, dst);
}
-static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+static void DC8uvNoTop_C(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
Put8x8uv(dc0 >> 3, dst);
}
-static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+static void DC8uvNoTopLeft_C(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// Edge filtering functions
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
@@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
}
// 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
const int a1 = VP8ksclip2[(a + 4) >> 3];
@@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
}
// 6 pixels in, 6 pixels out
-static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
p[ 2*step] = VP8kclip1[q2 - a3];
}
-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE int needs_filter2(const uint8_t* p,
- int step, int t, int it) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
+ int step, int t, int it) {
const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
const int p0 = p[-step], q0 = p[0];
const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i, stride, thresh2)) {
- do_filter2(p + i, stride);
+ if (NeedsFilter_C(p + i, stride, thresh2)) {
+ DoFilter2_C(p + i, stride);
}
}
}
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i * stride, 1, thresh2)) {
- do_filter2(p + i * stride, 1);
+ if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
+ DoFilter2_C(p + i * stride, 1);
}
}
}
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
- SimpleVFilter16(p, stride, thresh);
+ SimpleVFilter16_C(p, stride, thresh);
}
}
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
- SimpleHFilter16(p, stride, thresh);
+ SimpleHFilter16_C(p, stride, thresh);
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
-static WEBP_INLINE void FilterLoop26(uint8_t* p,
- int hstride, int vstride, int size,
- int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh,
+ int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh2, ithresh)) {
- if (hev(p, hstride, hev_thresh)) {
- do_filter2(p, hstride);
+ if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+ if (Hev(p, hstride, hev_thresh)) {
+ DoFilter2_C(p, hstride);
} else {
- do_filter6(p, hstride);
+ DoFilter6_C(p, hstride);
}
}
p += vstride;
}
}
-static WEBP_INLINE void FilterLoop24(uint8_t* p,
- int hstride, int vstride, int size,
- int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh,
+ int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh2, ithresh)) {
- if (hev(p, hstride, hev_thresh)) {
- do_filter2(p, hstride);
+ if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+ if (Hev(p, hstride, hev_thresh)) {
+ DoFilter2_C(p, hstride);
} else {
- do_filter4(p, hstride);
+ DoFilter4_C(p, hstride);
}
}
p += vstride;
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+#if !WEBP_NEON_OMIT_C_CODE
// on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+static void VFilter16_C(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
-static void HFilter16(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+static void HFilter16_C(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_C(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
- FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void HFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter16i_C(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
- FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+#if !WEBP_NEON_OMIT_C_CODE
// 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
- FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
- FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
- FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE
+static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
- FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
- FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
-static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
- int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
+ int dst_stride) {
int i, j;
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) {
@@ -709,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
VP8InitClipTables();
- VP8TransformWHT = TransformWHT;
- VP8Transform = TransformTwo;
- VP8TransformUV = TransformUV;
- VP8TransformDC = TransformDC;
- VP8TransformDCUV = TransformDCUV;
- VP8TransformAC3 = TransformAC3;
-
- VP8VFilter16 = VFilter16;
- VP8HFilter16 = HFilter16;
- VP8VFilter8 = VFilter8;
- VP8HFilter8 = HFilter8;
- VP8VFilter16i = VFilter16i;
- VP8HFilter16i = HFilter16i;
- VP8VFilter8i = VFilter8i;
- VP8HFilter8i = HFilter8i;
- VP8SimpleVFilter16 = SimpleVFilter16;
- VP8SimpleHFilter16 = SimpleHFilter16;
- VP8SimpleVFilter16i = SimpleVFilter16i;
- VP8SimpleHFilter16i = SimpleHFilter16i;
-
- VP8PredLuma4[0] = DC4;
- VP8PredLuma4[1] = TM4;
- VP8PredLuma4[2] = VE4;
- VP8PredLuma4[3] = HE4;
- VP8PredLuma4[4] = RD4;
- VP8PredLuma4[5] = VR4;
- VP8PredLuma4[6] = LD4;
- VP8PredLuma4[7] = VL4;
- VP8PredLuma4[8] = HD4;
- VP8PredLuma4[9] = HU4;
-
- VP8PredLuma16[0] = DC16;
- VP8PredLuma16[1] = TM16;
- VP8PredLuma16[2] = VE16;
- VP8PredLuma16[3] = HE16;
- VP8PredLuma16[4] = DC16NoTop;
- VP8PredLuma16[5] = DC16NoLeft;
- VP8PredLuma16[6] = DC16NoTopLeft;
-
- VP8PredChroma8[0] = DC8uv;
- VP8PredChroma8[1] = TM8uv;
- VP8PredChroma8[2] = VE8uv;
- VP8PredChroma8[3] = HE8uv;
- VP8PredChroma8[4] = DC8uvNoTop;
- VP8PredChroma8[5] = DC8uvNoLeft;
- VP8PredChroma8[6] = DC8uvNoTopLeft;
-
- VP8DitherCombine8x8 = DitherCombine8x8;
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8TransformWHT = TransformWHT_C;
+ VP8Transform = TransformTwo_C;
+ VP8TransformDC = TransformDC_C;
+ VP8TransformAC3 = TransformAC3_C;
+#endif
+ VP8TransformUV = TransformUV_C;
+ VP8TransformDCUV = TransformDCUV_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8VFilter16 = VFilter16_C;
+ VP8VFilter16i = VFilter16i_C;
+ VP8HFilter16 = HFilter16_C;
+ VP8VFilter8 = VFilter8_C;
+ VP8VFilter8i = VFilter8i_C;
+ VP8SimpleVFilter16 = SimpleVFilter16_C;
+ VP8SimpleHFilter16 = SimpleHFilter16_C;
+ VP8SimpleVFilter16i = SimpleVFilter16i_C;
+ VP8SimpleHFilter16i = SimpleHFilter16i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+ VP8HFilter16i = HFilter16i_C;
+ VP8HFilter8 = HFilter8_C;
+ VP8HFilter8i = HFilter8i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8PredLuma4[0] = DC4_C;
+ VP8PredLuma4[1] = TM4_C;
+ VP8PredLuma4[2] = VE4_C;
+ VP8PredLuma4[4] = RD4_C;
+ VP8PredLuma4[6] = LD4_C;
+#endif
+
+ VP8PredLuma4[3] = HE4_C;
+ VP8PredLuma4[5] = VR4_C;
+ VP8PredLuma4[7] = VL4_C;
+ VP8PredLuma4[8] = HD4_C;
+ VP8PredLuma4[9] = HU4_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8PredLuma16[0] = DC16_C;
+ VP8PredLuma16[1] = TM16_C;
+ VP8PredLuma16[2] = VE16_C;
+ VP8PredLuma16[3] = HE16_C;
+ VP8PredLuma16[4] = DC16NoTop_C;
+ VP8PredLuma16[5] = DC16NoLeft_C;
+ VP8PredLuma16[6] = DC16NoTopLeft_C;
+
+ VP8PredChroma8[0] = DC8uv_C;
+ VP8PredChroma8[1] = TM8uv_C;
+ VP8PredChroma8[2] = VE8uv_C;
+ VP8PredChroma8[3] = HE8uv_C;
+ VP8PredChroma8[4] = DC8uvNoTop_C;
+ VP8PredChroma8[5] = DC8uvNoLeft_C;
+ VP8PredChroma8[6] = DC8uvNoTopLeft_C;
+#endif
+
+ VP8DitherCombine8x8 = DitherCombine8x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -770,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
#endif
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- VP8DspInitNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8DspInitMIPS32();
@@ -791,5 +838,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8DspInitNEON();
+ }
+#endif
+
+ assert(VP8TransformWHT != NULL);
+ assert(VP8Transform != NULL);
+ assert(VP8TransformDC != NULL);
+ assert(VP8TransformAC3 != NULL);
+ assert(VP8TransformUV != NULL);
+ assert(VP8TransformDCUV != NULL);
+ assert(VP8VFilter16 != NULL);
+ assert(VP8HFilter16 != NULL);
+ assert(VP8VFilter8 != NULL);
+ assert(VP8HFilter8 != NULL);
+ assert(VP8VFilter16i != NULL);
+ assert(VP8HFilter16i != NULL);
+ assert(VP8VFilter8i != NULL);
+ assert(VP8HFilter8i != NULL);
+ assert(VP8SimpleVFilter16 != NULL);
+ assert(VP8SimpleHFilter16 != NULL);
+ assert(VP8SimpleVFilter16i != NULL);
+ assert(VP8SimpleHFilter16i != NULL);
+ assert(VP8PredLuma4[0] != NULL);
+ assert(VP8PredLuma4[1] != NULL);
+ assert(VP8PredLuma4[2] != NULL);
+ assert(VP8PredLuma4[3] != NULL);
+ assert(VP8PredLuma4[4] != NULL);
+ assert(VP8PredLuma4[5] != NULL);
+ assert(VP8PredLuma4[6] != NULL);
+ assert(VP8PredLuma4[7] != NULL);
+ assert(VP8PredLuma4[8] != NULL);
+ assert(VP8PredLuma4[9] != NULL);
+ assert(VP8PredLuma16[0] != NULL);
+ assert(VP8PredLuma16[1] != NULL);
+ assert(VP8PredLuma16[2] != NULL);
+ assert(VP8PredLuma16[3] != NULL);
+ assert(VP8PredLuma16[4] != NULL);
+ assert(VP8PredLuma16[5] != NULL);
+ assert(VP8PredLuma16[6] != NULL);
+ assert(VP8PredChroma8[0] != NULL);
+ assert(VP8PredChroma8[1] != NULL);
+ assert(VP8PredChroma8[2] != NULL);
+ assert(VP8PredChroma8[3] != NULL);
+ assert(VP8PredChroma8[4] != NULL);
+ assert(VP8PredChroma8[5] != NULL);
+ assert(VP8PredChroma8[6] != NULL);
+ assert(VP8DitherCombine8x8 != NULL);
+
dec_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
index 74ba34c..427b74f 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -11,11 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
-#define USE_STATIC_TABLES // undefine to have run-time table initialization
+// define to 0 to have run-time table initialization
+#if !defined(USE_STATIC_TABLES)
+#define USE_STATIC_TABLES 1 // ALTERNATE_CODE
+#endif
-#ifdef USE_STATIC_TABLES
+#if (USE_STATIC_TABLES == 1)
static const uint8_t abs0[255 + 255 + 1] = {
0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
-#endif
+#endif // USE_STATIC_TABLES
const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
-#if !defined(USE_STATIC_TABLES)
+#if (USE_STATIC_TABLES == 0)
int i;
if (!tables_ok) {
for (i = -255; i <= 255; ++i) {
diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips32.c b/src/3rdparty/libwebp/src/dsp/dec_mips32.c
index 4e9ef42..e4e7096 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_mips32.c
@@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
index db5c657..b0936bc 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
@@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
diff --git a/src/3rdparty/libwebp/src/dsp/dec_msa.c b/src/3rdparty/libwebp/src/dsp/dec_msa.c
index 8d9c98c..8090622 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_msa.c
@@ -12,11 +12,11 @@
// Author(s): Prashant Patil (prashant.patil@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"
//------------------------------------------------------------------------------
// Transforms
@@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
const v16i8 cnst4b = __msa_ldi_b(4); \
const v16i8 cnst3b = __msa_ldi_b(3); \
const v8i16 cnst9h = __msa_ldi_h(9); \
+ const v8i16 cnst63h = __msa_ldi_h(63); \
\
FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
filt = __msa_subs_s_b(p1_m, q1_m); \
@@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
/* update q2/p2 */ \
temp0 = filt_r * cnst9h; \
- temp1 = ADDVI_H(temp0, 63); \
+ temp1 = temp0 + cnst63h; \
temp2 = filt_l * cnst9h; \
- temp3 = ADDVI_H(temp2, 63); \
+ temp3 = temp2 + cnst63h; \
FILT2(q2_m, p2_m, q2, p2); \
/* update q1/p1 */ \
temp1 = temp1 + temp0; \
@@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) { // vertical
const uint32_t val0 = LW(ptop + 0);
const uint32_t val1 = LW(ptop + 4);
uint32_t out;
- v16u8 A, B, C, AC, B2, R;
+ v16u8 A = { 0 }, B, C, AC, B2, R;
INSERT_W2_UB(val0, val1, A);
B = SLDI_UB(A, A, 1);
@@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) { // Down-right
uint32_t val0 = LW(ptop + 0);
uint32_t val1 = LW(ptop + 4);
uint32_t val2, val3;
- v16u8 A, B, C, AC, B2, R, A1;
+ v16u8 A, B, C, AC, B2, R, A1 = { 0 };
INSERT_W2_UB(val0, val1, A1);
A = SLDI_UB(A1, A1, 12);
@@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) { // Down-Left
uint32_t val0 = LW(ptop + 0);
uint32_t val1 = LW(ptop + 4);
uint32_t val2, val3;
- v16u8 A, B, C, AC, B2, R;
+ v16u8 A = { 0 }, B, C, AC, B2, R;
INSERT_W2_UB(val0, val1, A);
B = SLDI_UB(A, A, 1);
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index 34796cf..ffa697f 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -12,43 +12,23 @@
// Authors: Somnath Banerjee (somnath@google.com)
// Johann Koenig (johannkoenig@google.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
-#include "./neon.h"
-#include "../dec/vp8i_dec.h"
+#include "src/dsp/neon.h"
+#include "src/dec/vp8i_dec.h"
//------------------------------------------------------------------------------
// NxM Loading functions
-// Load/Store vertical edge
-#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
- "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
- "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
- "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
- "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
- "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
- "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
- "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
- "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
-
-#define STORE8x2(c1, c2, p, stride) \
- "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
- "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
-
#if !defined(WORK_AROUND_GCC)
// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
-static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
+static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
+ int stride) {
const uint8x8_t zero = vdup_n_u8(0);
uint8x8x4_t out;
INIT_VECTOR4(out, zero, zero, zero, zero);
@@ -63,13 +43,15 @@ static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
return out;
}
-static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
+ uint8x16_t* const p1,
+ uint8x16_t* const p0,
+ uint8x16_t* const q0,
+ uint8x16_t* const q1) {
// row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
// row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
- const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
- const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
+ const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
+ const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
*p1 = vcombine_u8(row0.val[0], row8.val[0]);
*p0 = vcombine_u8(row0.val[1], row8.val[1]);
*q0 = vcombine_u8(row0.val[2], row8.val[2]);
@@ -83,9 +65,11 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
src += stride; \
} while (0)
-static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
+ uint8x16_t* const p1,
+ uint8x16_t* const p0,
+ uint8x16_t* const q0,
+ uint8x16_t* const q1) {
const uint32x4_t zero = vdupq_n_u32(0);
uint32x4x4_t in;
INIT_VECTOR4(in, zero, zero, zero, zero);
@@ -126,40 +110,40 @@ static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
#endif // !WORK_AROUND_GCC
-static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
- uint8x16_t* const p3, uint8x16_t* const p2,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1,
- uint8x16_t* const q2, uint8x16_t* const q3) {
- Load4x16(src - 2, stride, p3, p2, p1, p0);
- Load4x16(src + 2, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load8x16_NEON(
+ const uint8_t* const src, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+ uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
+ Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
}
-static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
+ uint8x16_t* const p1,
+ uint8x16_t* const p0,
+ uint8x16_t* const q0,
+ uint8x16_t* const q1) {
*p1 = vld1q_u8(src - 2 * stride);
*p0 = vld1q_u8(src - 1 * stride);
*q0 = vld1q_u8(src + 0 * stride);
*q1 = vld1q_u8(src + 1 * stride);
}
-static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
- uint8x16_t* const p3, uint8x16_t* const p2,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1,
- uint8x16_t* const q2, uint8x16_t* const q3) {
- Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
- Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load16x8_NEON(
+ const uint8_t* const src, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+ uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
+ Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
+ Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
}
-static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
- const uint8_t* const v,
- int stride,
- uint8x16_t* const p3, uint8x16_t* const p2,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1,
- uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2_NEON(
+ const uint8_t* const u, const uint8_t* const v, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+ uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
// and the v-samples on the higher half.
*p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
@@ -177,13 +161,11 @@ static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
#define LOAD_UV_8(ROW) \
vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
-static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
- const uint8_t* const v,
- int stride,
- uint8x16_t* const p3, uint8x16_t* const p2,
- uint8x16_t* const p1, uint8x16_t* const p0,
- uint8x16_t* const q0, uint8x16_t* const q1,
- uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2T_NEON(
+ const uint8_t* const u, const uint8_t* const v, int stride,
+ uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+ uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+ uint8x16_t* const q2, uint8x16_t* const q3) {
// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
// and the v-samples on the higher half.
const uint8x16_t row0 = LOAD_UV_8(0);
@@ -238,8 +220,8 @@ static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
#endif // !WORK_AROUND_GCC
-static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
- uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
+ uint8_t* const dst, int stride) {
vst2_lane_u8(dst + 0 * stride, v, 0);
vst2_lane_u8(dst + 1 * stride, v, 1);
vst2_lane_u8(dst + 2 * stride, v, 2);
@@ -250,20 +232,20 @@ static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
vst2_lane_u8(dst + 7 * stride, v, 7);
}
-static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
- uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const dst, int stride) {
uint8x8x2_t lo, hi;
lo.val[0] = vget_low_u8(p0);
lo.val[1] = vget_low_u8(q0);
hi.val[0] = vget_high_u8(p0);
hi.val[1] = vget_high_u8(q0);
- Store2x8(lo, dst - 1 + 0 * stride, stride);
- Store2x8(hi, dst - 1 + 8 * stride, stride);
+ Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
+ Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
}
#if !defined(WORK_AROUND_GCC)
-static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
- uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
+ uint8_t* const dst, int stride) {
vst4_lane_u8(dst + 0 * stride, v, 0);
vst4_lane_u8(dst + 1 * stride, v, 1);
vst4_lane_u8(dst + 2 * stride, v, 2);
@@ -274,9 +256,9 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
vst4_lane_u8(dst + 7 * stride, v, 7);
}
-static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const dst, int stride) {
uint8x8x4_t lo, hi;
INIT_VECTOR4(lo,
vget_low_u8(p1), vget_low_u8(p0),
@@ -284,27 +266,28 @@ static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
INIT_VECTOR4(hi,
vget_high_u8(p1), vget_high_u8(p0),
vget_high_u8(q0), vget_high_u8(q1));
- Store4x8(lo, dst - 2 + 0 * stride, stride);
- Store4x8(hi, dst - 2 + 8 * stride, stride);
+ Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
+ Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
}
#endif // !WORK_AROUND_GCC
-static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
- uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
+ uint8_t* const dst, int stride) {
vst1q_u8(dst - stride, p0);
vst1q_u8(dst, q0);
}
-static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- uint8_t* const dst, int stride) {
- Store16x2(p1, p0, dst - stride, stride);
- Store16x2(q0, q1, dst + stride, stride);
+static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ uint8_t* const dst, int stride) {
+ Store16x2_NEON(p1, p0, dst - stride, stride);
+ Store16x2_NEON(q0, q1, dst + stride, stride);
}
-static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
- uint8_t* const u, uint8_t* const v,
- int stride) {
+static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
+ const uint8x16_t q0,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
// p0 and q0 contain the u+v samples packed in low/high halves.
vst1_u8(u - stride, vget_low_u8(p0));
vst1_u8(u, vget_low_u8(q0));
@@ -312,13 +295,15 @@ static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
vst1_u8(v, vget_high_u8(q0));
}
-static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- uint8_t* const u, uint8_t* const v,
- int stride) {
+static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
+ const uint8x16_t p0,
+ const uint8x16_t q0,
+ const uint8x16_t q1,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
// The p1...q1 registers contain the u+v samples packed in low/high halves.
- Store8x2x2(p1, p0, u - stride, v - stride, stride);
- Store8x2x2(q0, q1, u + stride, v + stride, stride);
+ Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
+ Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
}
#if !defined(WORK_AROUND_GCC)
@@ -329,11 +314,10 @@ static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
(DST) += stride; \
} while (0)
-static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
- const uint8x16_t p0, const uint8x16_t q0,
- const uint8x16_t q1, const uint8x16_t q2,
- uint8_t* u, uint8_t* v,
- int stride) {
+static WEBP_INLINE void Store6x8x2_NEON(
+ const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+ uint8_t* u, uint8_t* v, int stride) {
uint8x8x3_t u0, u1, v0, v1;
INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
@@ -358,10 +342,12 @@ static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
}
#undef STORE6_LANE
-static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- uint8_t* const u, uint8_t* const v,
- int stride) {
+static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
+ const uint8x16_t p0,
+ const uint8x16_t q0,
+ const uint8x16_t q1,
+ uint8_t* const u, uint8_t* const v,
+ int stride) {
uint8x8x4_t u0, v0;
INIT_VECTOR4(u0,
vget_low_u8(p1), vget_low_u8(p0),
@@ -390,15 +376,15 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
#endif // !WORK_AROUND_GCC
// Zero extend 'v' to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
return vreinterpretq_s16_u16(vmovl_u8(v));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
- const int16x8_t dst01,
- const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -410,8 +396,9 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
- uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+ const int16x8_t row23,
+ uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
@@ -423,23 +410,23 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
{
// Convert to 16b.
- const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
- const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
+ const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
+ const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
- SaturateAndStore4x4(dst, out01, out23);
+ SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
-static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- int thresh) {
+static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ int thresh) {
const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
@@ -450,18 +437,18 @@ static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
return mask;
}
-static int8x16_t FlipSign(const uint8x16_t v) {
+static int8x16_t FlipSign_NEON(const uint8x16_t v) {
const uint8x16_t sign_bit = vdupq_n_u8(0x80);
return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}
-static uint8x16_t FlipSignBack(const int8x16_t v) {
+static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
const int8x16_t sign_bit = vdupq_n_s8(0x80);
return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}
-static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
- const int8x16_t q0, const int8x16_t q1) {
+static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
+ const int8x16_t q0, const int8x16_t q1) {
const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
@@ -470,7 +457,7 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
return s3;
}
-static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
+static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
@@ -479,9 +466,10 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
//------------------------------------------------------------------------------
-static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
- const int8x16_t delta,
- int8x16_t* const op0, int8x16_t* const oq0) {
+static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ int8x16_t* const op0,
+ int8x16_t* const oq0) {
const int8x16_t kCst3 = vdupq_n_s8(0x03);
const int8x16_t kCst4 = vdupq_n_s8(0x04);
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
@@ -494,9 +482,9 @@ static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
#if defined(WEBP_USE_INTRINSICS)
-static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
- const int8x16_t delta,
- uint8x16_t* const op0, uint8x16_t* const oq0) {
+static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ uint8x16_t* const op0, uint8x16_t* const oq0) {
const int8x16_t kCst3 = vdupq_n_s8(0x03);
const int8x16_t kCst4 = vdupq_n_s8(0x04);
const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
@@ -505,45 +493,66 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
- *op0 = FlipSignBack(sp0);
- *oq0 = FlipSignBack(sq0);
-}
-
-static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- const uint8x16_t mask,
- uint8x16_t* const op0, uint8x16_t* const oq0) {
- const int8x16_t p1s = FlipSign(p1);
- const int8x16_t p0s = FlipSign(p0);
- const int8x16_t q0s = FlipSign(q0);
- const int8x16_t q1s = FlipSign(q1);
- const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+ *op0 = FlipSignBack_NEON(sp0);
+ *oq0 = FlipSignBack_NEON(sq0);
+}
+
+static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t mask,
+ uint8x16_t* const op0, uint8x16_t* const oq0) {
+ const int8x16_t p1s = FlipSign_NEON(p1);
+ const int8x16_t p0s = FlipSign_NEON(p0);
+ const int8x16_t q0s = FlipSign_NEON(q0);
+ const int8x16_t q1s = FlipSign_NEON(q1);
+ const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
- ApplyFilter2(p0s, q0s, delta1, op0, oq0);
+ ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
}
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
uint8x16_t p1, p0, q0, q1, op0, oq0;
- Load16x4(p, stride, &p1, &p0, &q0, &q1);
+ Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
{
- const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
- DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+ const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
+ DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
}
- Store16x2(op0, oq0, p, stride);
+ Store16x2_NEON(op0, oq0, p, stride);
}
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
uint8x16_t p1, p0, q0, q1, oq0, op0;
- Load4x16(p, stride, &p1, &p0, &q0, &q1);
+ Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
{
- const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
- DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+ const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
+ DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
}
- Store2x16(op0, oq0, p, stride);
+ Store2x16_NEON(op0, oq0, p, stride);
}
#else
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
+ "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+ "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+ "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+ "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+ "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
+
+#define STORE8x2(c1, c2, p, stride) \
+ "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
+ "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
+
#define QRegs "q0", "q1", "q2", "q3", \
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
@@ -592,7 +601,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
FLIP_SIGN_BIT2(p0, q0, q10)
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
@@ -613,7 +622,7 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
);
}
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
__asm__ volatile (
"sub r4, %[p], #2 \n" // base1 = p - 2
"lsl r6, %[stride], #1 \n" // r6 = 2 * stride
@@ -639,30 +648,33 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
);
}
+#undef LOAD8x4
+#undef STORE8x2
+
#endif // WEBP_USE_INTRINSICS
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
uint32_t k;
for (k = 3; k != 0; --k) {
p += 4 * stride;
- SimpleVFilter16(p, stride, thresh);
+ SimpleVFilter16_NEON(p, stride, thresh);
}
}
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
uint32_t k;
for (k = 3; k != 0; --k) {
p += 4;
- SimpleHFilter16(p, stride, thresh);
+ SimpleHFilter16_NEON(p, stride, thresh);
}
}
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
-static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- int hev_thresh) {
+static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ int hev_thresh) {
const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
@@ -671,11 +683,11 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
return mask;
}
-static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
- const uint8x16_t p1, const uint8x16_t p0,
- const uint8x16_t q0, const uint8x16_t q1,
- const uint8x16_t q2, const uint8x16_t q3,
- int ithresh, int thresh) {
+static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
+ const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t q2, const uint8x16_t q3,
+ int ithresh, int thresh) {
const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
@@ -689,14 +701,14 @@ static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
const uint8x16_t max12 = vmaxq_u8(max1, max2);
const uint8x16_t max123 = vmaxq_u8(max12, max3);
const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
- const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
+ const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
const uint8x16_t mask = vandq_u8(mask1, mask2);
return mask;
}
// 4-points filter
-static void ApplyFilter4(
+static void ApplyFilter4_NEON(
const int8x16_t p1, const int8x16_t p0,
const int8x16_t q0, const int8x16_t q1,
const int8x16_t delta0,
@@ -709,47 +721,47 @@ static void ApplyFilter4(
const int8x16_t a1 = vshrq_n_s8(delta1, 3);
const int8x16_t a2 = vshrq_n_s8(delta2, 3);
const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
- *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2)
- *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
- *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3)
- *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3)
+ *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)
+ *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
+ *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)
+ *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)
}
-static void DoFilter4(
+static void DoFilter4_NEON(
const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
const uint8x16_t mask, const uint8x16_t hev_mask,
uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1) {
// This is a fused version of DoFilter2() calling ApplyFilter2 directly
- const int8x16_t p1s = FlipSign(p1);
- int8x16_t p0s = FlipSign(p0);
- int8x16_t q0s = FlipSign(q0);
- const int8x16_t q1s = FlipSign(q1);
+ const int8x16_t p1s = FlipSign_NEON(p1);
+ int8x16_t p0s = FlipSign_NEON(p0);
+ int8x16_t q0s = FlipSign_NEON(q0);
+ const int8x16_t q1s = FlipSign_NEON(q1);
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
// do_filter2 part (simple loopfilter on pixels with hev)
{
- const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
+ const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
const int8x16_t simple_lf_delta =
vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
- ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+ ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
// do_filter4 part (complex loopfilter on pixels without hev)
{
- const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
+ const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
// we use: (mask & hev_mask) ^ mask = mask & !hev_mask
const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
const int8x16_t complex_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
- ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
+ ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
}
}
// 6-points filter
-static void ApplyFilter6(
+static void ApplyFilter6_NEON(
const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
const int8x16_t delta,
@@ -778,35 +790,35 @@ static void ApplyFilter6(
const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
- *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1)
- *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - q1)
- *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2)
- *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2)
- *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3)
- *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3)
+ *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)
+ *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1)
+ *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)
+ *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)
+ *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)
+ *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)
}
-static void DoFilter6(
+static void DoFilter6_NEON(
const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
const uint8x16_t mask, const uint8x16_t hev_mask,
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
// This is a fused version of DoFilter2() calling ApplyFilter2 directly
- const int8x16_t p2s = FlipSign(p2);
- const int8x16_t p1s = FlipSign(p1);
- int8x16_t p0s = FlipSign(p0);
- int8x16_t q0s = FlipSign(q0);
- const int8x16_t q1s = FlipSign(q1);
- const int8x16_t q2s = FlipSign(q2);
+ const int8x16_t p2s = FlipSign_NEON(p2);
+ const int8x16_t p1s = FlipSign_NEON(p1);
+ int8x16_t p0s = FlipSign_NEON(p0);
+ int8x16_t q0s = FlipSign_NEON(q0);
+ const int8x16_t q1s = FlipSign_NEON(q1);
+ const int8x16_t q2s = FlipSign_NEON(q2);
const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
- const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+ const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
// do_filter2 part (simple loopfilter on pixels with hev)
{
const int8x16_t simple_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
- ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+ ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
// do_filter6 part (complex loopfilter on pixels without hev)
@@ -815,65 +827,65 @@ static void DoFilter6(
const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
const int8x16_t complex_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
- ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
- op2, op1, op0, oq0, oq1, oq2);
+ ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
+ op2, op1, op0, oq0, oq1, oq2);
}
}
// on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_NEON(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
- Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
- const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
- ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
- DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
- &op2, &op1, &op0, &oq0, &oq1, &oq2);
- Store16x2(op2, op1, p - 2 * stride, stride);
- Store16x2(op0, oq0, p + 0 * stride, stride);
- Store16x2(oq1, oq2, p + 2 * stride, stride);
+ DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store16x2_NEON(op2, op1, p - 2 * stride, stride);
+ Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
+ Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
}
}
-static void HFilter16(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_NEON(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
- Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
- const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
- ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
- DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
- &op2, &op1, &op0, &oq0, &oq1, &oq2);
- Store2x16(op2, op1, p - 2, stride);
- Store2x16(op0, oq0, p + 0, stride);
- Store2x16(oq1, oq2, p + 2, stride);
+ DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store2x16_NEON(op2, op1, p - 2, stride);
+ Store2x16_NEON(op0, oq0, p + 0, stride);
+ Store2x16_NEON(oq1, oq2, p + 2, stride);
}
}
// on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_NEON(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint32_t k;
uint8x16_t p3, p2, p1, p0;
- Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
+ Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
for (k = 3; k != 0; --k) {
uint8x16_t q0, q1, q2, q3;
p += 4 * stride;
- Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
+ Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask =
- NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
// p3 and p2 are not just temporary variables here: they will be
// re-used for next span. And q2/q3 will become p1/p0 accordingly.
- DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
- Store16x4(p1, p0, p3, p2, p, stride);
+ DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+ Store16x4_NEON(p1, p0, p3, p2, p, stride);
p1 = q2;
p0 = q3;
}
@@ -881,21 +893,21 @@ static void VFilter16i(uint8_t* p, int stride,
}
#if !defined(WORK_AROUND_GCC)
-static void HFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_NEON(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint32_t k;
uint8x16_t p3, p2, p1, p0;
- Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+ Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
for (k = 3; k != 0; --k) {
uint8x16_t q0, q1, q2, q3;
p += 4;
- Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
+ Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
{
const uint8x16_t mask =
- NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
- DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
- Store4x16(p1, p0, p3, p2, p, stride);
+ NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
+ DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+ Store4x16_NEON(p1, p0, p3, p2, p, stride);
p1 = q2;
p0 = q3;
}
@@ -904,67 +916,67 @@ static void HFilter16i(uint8_t* p, int stride,
#endif // !WORK_AROUND_GCC
// 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
- Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
- const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
- ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
- DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
- &op2, &op1, &op0, &oq0, &oq1, &oq2);
- Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
- Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
- Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
+ DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
+ Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
+ Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
}
}
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4 * stride;
v += 4 * stride;
- Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
- const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
- ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op1, op0, oq0, oq1;
- DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
- Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
+ DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+ Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
}
}
#if !defined(WORK_AROUND_GCC)
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
- Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
- const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
- ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op2, op1, op0, oq0, oq1, oq2;
- DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
- &op2, &op1, &op0, &oq0, &oq1, &oq2);
- Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
+ DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+ &op2, &op1, &op0, &oq0, &oq1, &oq2);
+ Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
}
}
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
u += 4;
v += 4;
- Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
{
- const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
- ithresh, thresh);
- const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+ const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+ ithresh, thresh);
+ const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
uint8x16_t op1, op0, oq0, oq1;
- DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
- Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
+ DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+ Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
}
}
#endif // !WORK_AROUND_GCC
@@ -992,8 +1004,9 @@ static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
#if defined(WEBP_USE_INTRINSICS)
-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
- int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+ const int16x8_t in1,
+ int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
@@ -1001,7 +1014,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
@@ -1024,20 +1037,20 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
- Transpose8x2(E0, E1, rows);
+ Transpose8x2_NEON(E0, E1, rows);
}
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
- TransformPass(&rows);
- TransformPass(&rows);
- Add4x4(rows.val[0], rows.val[1], dst);
+ TransformPass_NEON(&rows);
+ TransformPass_NEON(&rows);
+ Add4x4_NEON(rows.val[0], rows.val[1], dst);
}
#else
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
// kC1, kC2. Padded because vld1.16 loads 8 bytes
const int16_t constants[4] = { kC1, kC2, 0, 0 };
@@ -1170,16 +1183,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
#endif // WEBP_USE_INTRINSICS
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
- TransformOne(in, dst);
+static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne_NEON(in, dst);
if (do_two) {
- TransformOne(in + 16, dst + 4);
+ TransformOne_NEON(in + 16, dst + 4);
}
}
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
const int16x8_t DC = vdupq_n_s16(in[0]);
- Add4x4(DC, DC, dst);
+ Add4x4_NEON(DC, DC, dst);
}
//------------------------------------------------------------------------------
@@ -1191,7 +1204,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)
-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
int32x4x4_t tmp;
{
@@ -1209,7 +1222,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
tmp.val[2] = vsubq_s32(a0, a1);
tmp.val[3] = vsubq_s32(a3, a2);
// Arrange the temporary results column-wise.
- tmp = Transpose4x4(tmp);
+ tmp = Transpose4x4_NEON(tmp);
}
{
@@ -1243,7 +1256,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
#define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
static const int kC1_full = 20091 + (1 << 16);
static const int kC2_full = 35468;
const int16x4_t A = vld1_dup_s16(in);
@@ -1259,14 +1272,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
const int16x4_t B = vqadd_s16(A, CD);
const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
- Add4x4(m0_m1, m2_m3, dst);
+ Add4x4_NEON(m0_m1, m2_m3, dst);
}
#undef MUL
//------------------------------------------------------------------------------
// 4x4
-static void DC4(uint8_t* dst) { // DC
+static void DC4_NEON(uint8_t* dst) { // DC
const uint8x8_t A = vld1_u8(dst - BPS); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
@@ -1287,17 +1300,17 @@ static void DC4(uint8_t* dst) { // DC
}
// TrueMotion (4x4 + 8x8)
-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
int y;
for (y = 0; y < size; y += 4) {
// left edge
- const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
- const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
- const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
- const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
const int16x8_t r1 = vaddq_s16(L1, d);
const int16x8_t r2 = vaddq_s16(L2, d);
@@ -1322,9 +1335,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
}
}
-static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
-static void VE4(uint8_t* dst) { // vertical
+static void VE4_NEON(uint8_t* dst) { // vertical
// NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
const uint64x1_t A1 = vshr_n_u64(A0, 8);
@@ -1340,7 +1353,7 @@ static void VE4(uint8_t* dst) { // vertical
}
}
-static void RD4(uint8_t* dst) { // Down-right
+static void RD4_NEON(uint8_t* dst) { // Down-right
const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
@@ -1368,7 +1381,7 @@ static void RD4(uint8_t* dst) { // Down-right
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
-static void LD4(uint8_t* dst) { // Down-left
+static void LD4_NEON(uint8_t* dst) { // Down-left
// Note using the same shift trick as VE4() is slower here.
const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
@@ -1390,7 +1403,7 @@ static void LD4(uint8_t* dst) { // Down-left
//------------------------------------------------------------------------------
// Chroma
-static void VE8uv(uint8_t* dst) { // vertical
+static void VE8uv_NEON(uint8_t* dst) { // vertical
const uint8x8_t top = vld1_u8(dst - BPS);
int j;
for (j = 0; j < 8; ++j) {
@@ -1398,7 +1411,7 @@ static void VE8uv(uint8_t* dst) { // vertical
}
}
-static void HE8uv(uint8_t* dst) { // horizontal
+static void HE8uv_NEON(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
const uint8x8_t left = vld1_dup_u8(dst - 1);
@@ -1407,7 +1420,7 @@ static void HE8uv(uint8_t* dst) { // horizontal
}
}
-static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
@@ -1458,17 +1471,17 @@ static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
}
}
-static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
-static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
-static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
-static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
+static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
+static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
+static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
//------------------------------------------------------------------------------
// 16x16
-static void VE16(uint8_t* dst) { // vertical
+static void VE16_NEON(uint8_t* dst) { // vertical
const uint8x16_t top = vld1q_u8(dst - BPS);
int j;
for (j = 0; j < 16; ++j) {
@@ -1476,7 +1489,7 @@ static void VE16(uint8_t* dst) { // vertical
}
}
-static void HE16(uint8_t* dst) { // horizontal
+static void HE16_NEON(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 16; ++j) {
const uint8x16_t left = vld1q_dup_u8(dst - 1);
@@ -1485,7 +1498,7 @@ static void HE16(uint8_t* dst) { // horizontal
}
}
-static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
@@ -1542,12 +1555,12 @@ static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
}
}
-static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
-static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
-static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
-static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
+static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
+static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
+static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
-static void TM16(uint8_t* dst) {
+static void TM16_NEON(uint8_t* dst) {
const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
// A[c] - A[-1]
@@ -1556,10 +1569,10 @@ static void TM16(uint8_t* dst) {
int y;
for (y = 0; y < 16; y += 4) {
// left edge
- const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
- const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
- const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
- const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
@@ -1587,49 +1600,49 @@ static void TM16(uint8_t* dst) {
extern void VP8DspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
- VP8Transform = TransformTwo;
- VP8TransformAC3 = TransformAC3;
- VP8TransformDC = TransformDC;
- VP8TransformWHT = TransformWHT;
-
- VP8VFilter16 = VFilter16;
- VP8VFilter16i = VFilter16i;
- VP8HFilter16 = HFilter16;
+ VP8Transform = TransformTwo_NEON;
+ VP8TransformAC3 = TransformAC3_NEON;
+ VP8TransformDC = TransformDC_NEON;
+ VP8TransformWHT = TransformWHT_NEON;
+
+ VP8VFilter16 = VFilter16_NEON;
+ VP8VFilter16i = VFilter16i_NEON;
+ VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
- VP8HFilter16i = HFilter16i;
+ VP8HFilter16i = HFilter16i_NEON;
#endif
- VP8VFilter8 = VFilter8;
- VP8VFilter8i = VFilter8i;
+ VP8VFilter8 = VFilter8_NEON;
+ VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
- VP8HFilter8 = HFilter8;
- VP8HFilter8i = HFilter8i;
+ VP8HFilter8 = HFilter8_NEON;
+ VP8HFilter8i = HFilter8i_NEON;
#endif
- VP8SimpleVFilter16 = SimpleVFilter16;
- VP8SimpleHFilter16 = SimpleHFilter16;
- VP8SimpleVFilter16i = SimpleVFilter16i;
- VP8SimpleHFilter16i = SimpleHFilter16i;
-
- VP8PredLuma4[0] = DC4;
- VP8PredLuma4[1] = TM4;
- VP8PredLuma4[2] = VE4;
- VP8PredLuma4[4] = RD4;
- VP8PredLuma4[6] = LD4;
-
- VP8PredLuma16[0] = DC16TopLeft;
- VP8PredLuma16[1] = TM16;
- VP8PredLuma16[2] = VE16;
- VP8PredLuma16[3] = HE16;
- VP8PredLuma16[4] = DC16NoTop;
- VP8PredLuma16[5] = DC16NoLeft;
- VP8PredLuma16[6] = DC16NoTopLeft;
-
- VP8PredChroma8[0] = DC8uv;
- VP8PredChroma8[1] = TM8uv;
- VP8PredChroma8[2] = VE8uv;
- VP8PredChroma8[3] = HE8uv;
- VP8PredChroma8[4] = DC8uvNoTop;
- VP8PredChroma8[5] = DC8uvNoLeft;
- VP8PredChroma8[6] = DC8uvNoTopLeft;
+ VP8SimpleVFilter16 = SimpleVFilter16_NEON;
+ VP8SimpleHFilter16 = SimpleHFilter16_NEON;
+ VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
+ VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
+
+ VP8PredLuma4[0] = DC4_NEON;
+ VP8PredLuma4[1] = TM4_NEON;
+ VP8PredLuma4[2] = VE4_NEON;
+ VP8PredLuma4[4] = RD4_NEON;
+ VP8PredLuma4[6] = LD4_NEON;
+
+ VP8PredLuma16[0] = DC16TopLeft_NEON;
+ VP8PredLuma16[1] = TM16_NEON;
+ VP8PredLuma16[2] = VE16_NEON;
+ VP8PredLuma16[3] = HE16_NEON;
+ VP8PredLuma16[4] = DC16NoTop_NEON;
+ VP8PredLuma16[5] = DC16NoLeft_NEON;
+ VP8PredLuma16[6] = DC16NoTopLeft_NEON;
+
+ VP8PredChroma8[0] = DC8uv_NEON;
+ VP8PredChroma8[1] = TM8uv_NEON;
+ VP8PredChroma8[2] = VE8uv_NEON;
+ VP8PredChroma8[3] = HE8uv_NEON;
+ VP8PredChroma8[4] = DC8uvNoTop_NEON;
+ VP8PredChroma8[5] = DC8uvNoLeft_NEON;
+ VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}
#else // !WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
index 411fb02..b3840fa 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -12,23 +12,25 @@
// Author: somnath@google.com (Somnath Banerjee)
// cduvivier@google.com (Christian Duvivier)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
// one it seems => disable it by default. Uncomment the following to enable:
-// #define USE_TRANSFORM_AC3
+#if !defined(USE_TRANSFORM_AC3)
+#define USE_TRANSFORM_AC3 0 // ALTERNATE_CODE
+#endif
#include <emmintrin.h>
-#include "./common_sse2.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
-static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
}
}
-#if defined(USE_TRANSFORM_AC3)
+#if (USE_TRANSFORM_AC3 == 1)
#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static const int kC1 = 20091 + (1 << 16);
@@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
_mm_subs_epu8((p), (q)))
// Shift each byte of "x" by 3 bits while preserving by the sign bit.
-static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
const __m128i zero = _mm_setzero_si128();
const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
}
#define FLIP_SIGN_BIT2(a, b) { \
- a = _mm_xor_si128(a, sign_bit); \
- b = _mm_xor_si128(b, sign_bit); \
+ (a) = _mm_xor_si128(a, sign_bit); \
+ (b) = _mm_xor_si128(b, sign_bit); \
}
#define FLIP_SIGN_BIT4(a, b, c, d) { \
@@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
}
// input/output is uint8_t
-static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
- const __m128i* const p0,
- const __m128i* const q0,
- const __m128i* const q1,
- int hev_thresh, __m128i* const not_hev) {
+static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int hev_thresh, __m128i* const not_hev) {
const __m128i zero = _mm_setzero_si128();
const __m128i t_1 = MM_ABS(*p1, *p0);
const __m128i t_2 = MM_ABS(*q1, *q0);
@@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
}
// input pixels are int8_t
-static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
- const __m128i* const p0,
- const __m128i* const q0,
- const __m128i* const q1,
- __m128i* const delta) {
+static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ __m128i* const delta) {
// beware of addition order, for saturation!
const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0
@@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
}
// input and output are int8_t
-static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
- const __m128i* const fl) {
+static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
+ __m128i* const q0,
+ const __m128i* const fl) {
const __m128i k3 = _mm_set1_epi8(3);
const __m128i k4 = _mm_set1_epi8(4);
__m128i v3 = _mm_adds_epi8(*fl, k3);
__m128i v4 = _mm_adds_epi8(*fl, k4);
- SignedShift8b(&v4); // v4 >> 3
- SignedShift8b(&v3); // v3 >> 3
+ SignedShift8b_SSE2(&v4); // v4 >> 3
+ SignedShift8b_SSE2(&v3); // v3 >> 3
*q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4
*p0 = _mm_adds_epi8(*p0, v3); // p0 += v3
}
@@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
-static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
- const __m128i* const a0_lo,
- const __m128i* const a0_hi) {
+static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
+ const __m128i* const a0_lo,
+ const __m128i* const a0_hi) {
const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
}
// input pixels are uint8_t
-static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
- const __m128i* const p0,
- const __m128i* const q0,
- const __m128i* const q1,
- int thresh, __m128i* const mask) {
+static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int thresh, __m128i* const mask) {
const __m128i m_thresh = _mm_set1_epi8(thresh);
const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
const __m128i kFE = _mm_set1_epi8(0xFE);
@@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
// Edge filtering functions
// Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
- __m128i* const q0, __m128i* const q1,
- int thresh) {
+static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1,
+ int thresh) {
__m128i a, mask;
const __m128i sign_bit = _mm_set1_epi8(0x80);
- // convert p1/q1 to int8_t (for GetBaseDelta)
+ // convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
- NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+ NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
FLIP_SIGN_BIT2(*p0, *q0);
- GetBaseDelta(&p1s, p0, q0, &q1s, &a);
+ GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
a = _mm_and_si128(a, mask); // mask filter values we don't care about
- DoSimpleFilter(p0, q0, &a);
+ DoSimpleFilter_SSE2(p0, q0, &a);
FLIP_SIGN_BIT2(*p0, *q0);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
- __m128i* const q0, __m128i* const q1,
- const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1,
+ const __m128i* const mask,
+ int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
const __m128i k64 = _mm_set1_epi8(64);
@@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
__m128i t1, t2, t3;
// compute hev mask
- GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+ GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
// convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3
t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4
- SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
- SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+ SignedShift8b_SSE2(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+ SignedShift8b_SSE2(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
*q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3
FLIP_SIGN_BIT2(*p0, *q0);
@@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
- __m128i* const p0, __m128i* const q0,
- __m128i* const q1, __m128i* const q2,
- const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
+ __m128i* const p0, __m128i* const q0,
+ __m128i* const q1, __m128i* const q2,
+ const __m128i* const mask,
+ int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
__m128i a, not_hev;
// compute hev mask
- GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+ GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
FLIP_SIGN_BIT2(*p2, *q2);
- GetBaseDelta(p1, p0, q0, q1, &a);
+ GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
{ // do simple filter on pixels with hev
const __m128i m = _mm_andnot_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
- DoSimpleFilter(p0, q0, &f);
+ DoSimpleFilter_SSE2(p0, q0, &f);
}
{ // do strong filter on pixels with not hev
@@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63
const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63
- Update2Pixels(p2, q2, &a2_lo, &a2_hi);
- Update2Pixels(p1, q1, &a1_lo, &a1_hi);
- Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+ Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
+ Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
+ Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
}
}
// reads 8 rows across a vertical edge.
-static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
- __m128i* const p, __m128i* const q) {
+static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
+ __m128i* const p, __m128i* const q) {
// A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
// A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
const __m128i A0 = _mm_set_epi32(
@@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
*q = _mm_unpackhi_epi32(C0, C1);
}
-static WEBP_INLINE void Load16x4(const uint8_t* const r0,
- const uint8_t* const r8,
- int stride,
- __m128i* const p1, __m128i* const p0,
- __m128i* const q0, __m128i* const q1) {
+static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
+ const uint8_t* const r8,
+ int stride,
+ __m128i* const p1, __m128i* const p0,
+ __m128i* const q0, __m128i* const q1) {
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
@@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- Load8x4(r0, stride, p1, q0);
- Load8x4(r8, stride, p0, q1);
+ Load8x4_SSE2(r0, stride, p1, q0);
+ Load8x4_SSE2(r8, stride, p0, q1);
{
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
}
}
-static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
+ uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
}
// Transpose back and store
-static WEBP_INLINE void Store16x4(const __m128i* const p1,
- const __m128i* const p0,
- const __m128i* const q0,
- const __m128i* const q1,
- uint8_t* r0, uint8_t* r8,
- int stride) {
+static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ uint8_t* r0, uint8_t* r8,
+ int stride) {
__m128i t1, p1_s, p0_s, q0_s, q1_s;
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
p1_s = _mm_unpacklo_epi16(t1, q1_s);
q1_s = _mm_unpackhi_epi16(t1, q1_s);
- Store4x4(&p0_s, r0, stride);
+ Store4x4_SSE2(&p0_s, r0, stride);
r0 += 4 * stride;
- Store4x4(&q0_s, r0, stride);
+ Store4x4_SSE2(&q0_s, r0, stride);
- Store4x4(&p1_s, r8, stride);
+ Store4x4_SSE2(&p1_s, r8, stride);
r8 += 4 * stride;
- Store4x4(&q1_s, r8, stride);
+ Store4x4_SSE2(&q1_s, r8, stride);
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
// Load
__m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
__m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
__m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
__m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
- DoFilter2(&p1, &p0, &q0, &q1, thresh);
+ DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-stride], p0);
_mm_storeu_si128((__m128i*)&p[0], q0);
}
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
__m128i p1, p0, q0, q1;
p -= 2; // beginning of p1
- Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
- DoFilter2(&p1, &p0, &q0, &q1, thresh);
- Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
+ Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+ DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
+ Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
}
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
- SimpleVFilter16(p, stride, thresh);
+ SimpleVFilter16_SSE2(p, stride, thresh);
}
}
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
- SimpleHFilter16(p, stride, thresh);
+ SimpleHFilter16_SSE2(p, stride, thresh);
}
}
@@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
// Complex In-loop filtering (Paragraph 15.3)
#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
- m = MM_ABS(p1, p0); \
- m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
- m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
+ (m) = MM_ABS(p1, p0); \
+ (m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
+ (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
} while (0)
#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
- m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
- m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
- m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
+ (m) = _mm_max_epu8(m, MM_ABS(p1, p0)); \
+ (m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
+ (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
} while (0)
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
- e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
- e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \
- e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \
- e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
+ (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \
+ (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \
+ (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \
+ (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \
}
#define LOADUV_H_EDGE(p, u, v, stride) do { \
const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
- p = _mm_unpacklo_epi64(U, V); \
+ (p) = _mm_unpacklo_epi64(U, V); \
} while (0)
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
- LOADUV_H_EDGE(e1, u, v, 0 * stride); \
- LOADUV_H_EDGE(e2, u, v, 1 * stride); \
- LOADUV_H_EDGE(e3, u, v, 2 * stride); \
- LOADUV_H_EDGE(e4, u, v, 3 * stride); \
+ LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \
+ LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \
+ LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \
+ LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \
}
#define STOREUV(p, u, v, stride) { \
- _mm_storel_epi64((__m128i*)&u[(stride)], p); \
- p = _mm_srli_si128(p, 8); \
- _mm_storel_epi64((__m128i*)&v[(stride)], p); \
+ _mm_storel_epi64((__m128i*)&(u)[(stride)], p); \
+ (p) = _mm_srli_si128(p, 8); \
+ _mm_storel_epi64((__m128i*)&(v)[(stride)], p); \
}
-static WEBP_INLINE void ComplexMask(const __m128i* const p1,
- const __m128i* const p0,
- const __m128i* const q0,
- const __m128i* const q1,
- int thresh, int ithresh,
- __m128i* const mask) {
+static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
+ const __m128i* const p0,
+ const __m128i* const q0,
+ const __m128i* const q1,
+ int thresh, int ithresh,
+ __m128i* const mask) {
const __m128i it = _mm_set1_epi8(ithresh);
const __m128i diff = _mm_subs_epu8(*mask, it);
const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
__m128i filter_mask;
- NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+ NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
*mask = _mm_and_si128(thresh_mask, filter_mask);
}
// on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_SSE2(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i t1;
__m128i mask;
__m128i p2, p1, p0, q0, q1, q2;
@@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
- ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
- DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+ DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
_mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
}
-static void HFilter16(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_SSE2(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
uint8_t* const b = p - 4;
- Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
+ Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
MAX_DIFF1(p3, p2, p1, p0, mask);
- Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
+ Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
MAX_DIFF2(q3, q2, q1, q0, mask);
- ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
- DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+ DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
- Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
- Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
+ Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+ Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
}
// on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_SSE2(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
__m128i p3, p2, p1, p0; // loop invariants
@@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,
// p3 and p2 are not just temporary variables here: they will be
// re-used for next span. And q2/q3 will become p1/p0 accordingly.
- ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
- DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+ DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
}
}
-static void HFilter16i(uint8_t* p, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_SSE2(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
int k;
__m128i p3, p2, p1, p0; // loop invariants
- Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
+ Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
for (k = 3; k > 0; --k) {
__m128i mask, tmp1, tmp2;
@@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
p += 4; // beginning of q0 (and next span)
MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
- Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+ Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
- ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
- DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+ DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
- Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+ Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
// rotate samples
p1 = tmp1;
@@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
- ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
- DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+ DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
STOREUV(p2, u, v, -3 * stride);
@@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
uint8_t* const tu = u - 4;
uint8_t* const tv = v - 4;
- Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
+ Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
MAX_DIFF1(p3, p2, p1, p0, mask);
- Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
+ Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
MAX_DIFF2(q3, q2, q1, q0, mask);
- ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
- DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+ DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
- Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
- Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
+ Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
+ Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
}
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
MAX_DIFF2(t2, t1, q1, q0, mask);
- ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
- DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+ DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store
STOREUV(p1, u, v, -2 * stride);
@@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
- int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
- Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
+ Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
MAX_DIFF1(t2, t1, p1, p0, mask);
u += 4; // beginning of q0
v += 4;
- Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
+ Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
- ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
- DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+ ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+ DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
- Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
+ Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
}
//------------------------------------------------------------------------------
@@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
-static void VE4(uint8_t* dst) { // vertical
+static void VE4_SSE2(uint8_t* dst) { // vertical
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) { // vertical
}
}
-static void LD4(uint8_t* dst) { // Down-Left
+static void LD4_SSE2(uint8_t* dst) { // Down-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) { // Down-Left
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
-static void VR4(uint8_t* dst) { // Vertical-Right
+static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
const __m128i one = _mm_set1_epi8(1);
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
@@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
DST(0, 3) = AVG3(K, J, I);
}
-static void VL4(uint8_t* dst) { // Vertical-Left
+static void VL4_SSE2(uint8_t* dst) { // Vertical-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
DST(3, 3) = (extra_out >> 8) & 0xff;
}
-static void RD4(uint8_t* dst) { // Down-right
+static void RD4_SSE2(uint8_t* dst) { // Down-right
const __m128i one = _mm_set1_epi8(1);
const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) { // Down-right
//------------------------------------------------------------------------------
// Luma 16x16
-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const __m128i zero = _mm_setzero_si128();
int y;
@@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
}
}
-static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
+static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); }
+static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
+static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); }
-static void VE16(uint8_t* dst) {
+static void VE16_SSE2(uint8_t* dst) {
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
int j;
for (j = 0; j < 16; ++j) {
@@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
}
}
-static void HE16(uint8_t* dst) { // horizontal
+static void HE16_SSE2(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
const __m128i values = _mm_set1_epi8(dst[-1]);
@@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) { // horizontal
}
}
-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 16; ++j) {
@@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
}
}
-static void DC16(uint8_t* dst) { // DC
+static void DC16_SSE2(uint8_t* dst) { // DC
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) { // DC
}
{
const int DC = _mm_cvtsi128_si32(sum) + left + 16;
- Put16(DC >> 5, dst);
+ Put16_SSE2(DC >> 5, dst);
}
}
-static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
DC += dst[-1 + j * BPS];
}
- Put16(DC >> 4, dst);
+ Put16_SSE2(DC >> 4, dst);
}
-static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
const int DC = _mm_cvtsi128_si32(sum) + 8;
- Put16(DC >> 4, dst);
+ Put16_SSE2(DC >> 4, dst);
}
-static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
- Put16(0x80, dst);
+static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples
+ Put16_SSE2(0x80, dst);
}
//------------------------------------------------------------------------------
// Chroma
-static void VE8uv(uint8_t* dst) { // vertical
+static void VE8uv_SSE2(uint8_t* dst) { // vertical
int j;
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
for (j = 0; j < 8; ++j) {
@@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) { // vertical
}
}
-static void HE8uv(uint8_t* dst) { // horizontal
- int j;
- for (j = 0; j < 8; ++j) {
- const __m128i values = _mm_set1_epi8(dst[-1]);
- _mm_storel_epi64((__m128i*)dst, values);
- dst += BPS;
- }
-}
-
// helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 8; ++j) {
@@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
}
}
-static void DC8uv(uint8_t* dst) { // DC
+static void DC8uv_SSE2(uint8_t* dst) { // DC
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
const __m128i sum = _mm_sad_epu8(top, zero);
@@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) { // DC
}
{
const int DC = _mm_cvtsi128_si32(sum) + left + 8;
- Put8x8uv(DC >> 4, dst);
+ Put8x8uv_SSE2(DC >> 4, dst);
}
}
-static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
const __m128i sum = _mm_sad_epu8(top, zero);
const int DC = _mm_cvtsi128_si32(sum) + 4;
- Put8x8uv(DC >> 3, dst);
+ Put8x8uv_SSE2(DC >> 3, dst);
}
-static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[-1 + i * BPS];
}
- Put8x8uv(dc0 >> 3, dst);
+ Put8x8uv_SSE2(dc0 >> 3, dst);
}
-static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
- Put8x8uv(0x80, dst);
+static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing
+ Put8x8uv_SSE2(0x80, dst);
}
//------------------------------------------------------------------------------
@@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
extern void VP8DspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
- VP8Transform = Transform;
-#if defined(USE_TRANSFORM_AC3)
- VP8TransformAC3 = TransformAC3;
+ VP8Transform = Transform_SSE2;
+#if (USE_TRANSFORM_AC3 == 1)
+ VP8TransformAC3 = TransformAC3_SSE2;
#endif
- VP8VFilter16 = VFilter16;
- VP8HFilter16 = HFilter16;
- VP8VFilter8 = VFilter8;
- VP8HFilter8 = HFilter8;
- VP8VFilter16i = VFilter16i;
- VP8HFilter16i = HFilter16i;
- VP8VFilter8i = VFilter8i;
- VP8HFilter8i = HFilter8i;
-
- VP8SimpleVFilter16 = SimpleVFilter16;
- VP8SimpleHFilter16 = SimpleHFilter16;
- VP8SimpleVFilter16i = SimpleVFilter16i;
- VP8SimpleHFilter16i = SimpleHFilter16i;
-
- VP8PredLuma4[1] = TM4;
- VP8PredLuma4[2] = VE4;
- VP8PredLuma4[4] = RD4;
- VP8PredLuma4[5] = VR4;
- VP8PredLuma4[6] = LD4;
- VP8PredLuma4[7] = VL4;
-
- VP8PredLuma16[0] = DC16;
- VP8PredLuma16[1] = TM16;
- VP8PredLuma16[2] = VE16;
- VP8PredLuma16[3] = HE16;
- VP8PredLuma16[4] = DC16NoTop;
- VP8PredLuma16[5] = DC16NoLeft;
- VP8PredLuma16[6] = DC16NoTopLeft;
-
- VP8PredChroma8[0] = DC8uv;
- VP8PredChroma8[1] = TM8uv;
- VP8PredChroma8[2] = VE8uv;
- VP8PredChroma8[3] = HE8uv;
- VP8PredChroma8[4] = DC8uvNoTop;
- VP8PredChroma8[5] = DC8uvNoLeft;
- VP8PredChroma8[6] = DC8uvNoTopLeft;
+ VP8VFilter16 = VFilter16_SSE2;
+ VP8HFilter16 = HFilter16_SSE2;
+ VP8VFilter8 = VFilter8_SSE2;
+ VP8HFilter8 = HFilter8_SSE2;
+ VP8VFilter16i = VFilter16i_SSE2;
+ VP8HFilter16i = HFilter16i_SSE2;
+ VP8VFilter8i = VFilter8i_SSE2;
+ VP8HFilter8i = HFilter8i_SSE2;
+
+ VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
+ VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
+ VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
+ VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
+
+ VP8PredLuma4[1] = TM4_SSE2;
+ VP8PredLuma4[2] = VE4_SSE2;
+ VP8PredLuma4[4] = RD4_SSE2;
+ VP8PredLuma4[5] = VR4_SSE2;
+ VP8PredLuma4[6] = LD4_SSE2;
+ VP8PredLuma4[7] = VL4_SSE2;
+
+ VP8PredLuma16[0] = DC16_SSE2;
+ VP8PredLuma16[1] = TM16_SSE2;
+ VP8PredLuma16[2] = VE16_SSE2;
+ VP8PredLuma16[3] = HE16_SSE2;
+ VP8PredLuma16[4] = DC16NoTop_SSE2;
+ VP8PredLuma16[5] = DC16NoLeft_SSE2;
+ VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
+
+ VP8PredChroma8[0] = DC8uv_SSE2;
+ VP8PredChroma8[1] = TM8uv_SSE2;
+ VP8PredChroma8[2] = VE8uv_SSE2;
+ VP8PredChroma8[4] = DC8uvNoTop_SSE2;
+ VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
+ VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
index 4e81ec4..8f18506 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
@@ -11,15 +11,15 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
-static void HE16(uint8_t* dst) { // horizontal
+static void HE16_SSE41(uint8_t* dst) { // horizontal
int j;
const __m128i kShuffle3 = _mm_set1_epi8(3);
for (j = 16; j > 0; --j) {
@@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) { // horizontal
extern void VP8DspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
- VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[3] = HE16_SSE41;
}
#else // !WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 7cf6e05..5b703a1 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -15,10 +15,10 @@
#define WEBP_DSP_DSP_H_
#ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
#endif
-#include "../webp/types.h"
+#include "src/webp/types.h"
#ifdef __cplusplus
extern "C" {
@@ -38,10 +38,22 @@ extern "C" {
# define LOCAL_GCC_PREREQ(maj, min) 0
#endif
+#if defined(__clang__)
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+ (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
+// for now, none of the optimizations below are available in emscripten
+#if !defined(EMSCRIPTEN)
+
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@@ -69,18 +81,20 @@ extern "C" {
#define WEBP_USE_AVX2
#endif
-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
-#define WEBP_ANDROID_NEON // Android targets that might support NEON
-#endif
-
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+#if (defined(__ARM_NEON__) || \
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+ defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
@@ -91,7 +105,7 @@ extern "C" {
#define WEBP_USE_MIPS32
#if (__mips_isa_rev >= 2)
#define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
#define WEBP_USE_MIPS_DSP_R2
#endif
#endif
@@ -101,6 +115,24 @@ extern "C" {
#define WEBP_USE_MSA
#endif
+#endif /* EMSCRIPTEN */
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
// This macro prevents thread_sanitizer from reporting known concurrent writes.
#define WEBP_TSAN_IGNORE_FUNCTION
#if defined(__has_feature)
@@ -130,6 +162,11 @@ extern "C" {
#endif
#endif
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
typedef enum {
kSSE2,
kSSE3,
@@ -144,7 +181,7 @@ typedef enum {
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
//------------------------------------------------------------------------------
// Init stub generator
@@ -272,6 +309,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
int xo, int yo, // center position
int W, int H); // plane dimension
+#if !defined(WEBP_REDUCE_SIZE)
// This version is called with the guarantee that you can load 8 bytes and
// 8 rows at offset src1 and src2
typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@@ -279,10 +317,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked
extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping
+#endif
+#if !defined(WEBP_DISABLE_STATS)
typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
const uint8_t* src2, int len);
extern VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
// must be called before using any of the above directly
void VP8SSIMDspInit(void);
@@ -463,12 +504,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
// Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
- const uint8_t* src);
-extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
- const uint8_t* src);
-extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
-extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
// Main entry calls:
extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@@ -534,24 +575,21 @@ void WebPMultRows(uint8_t* ptr, int stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
- int width, int inverse);
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
-
-// To be called first before using the above.
-void WebPInitAlphaProcessing(void);
-
-// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
- const uint8_t* g, const uint8_t* b, int len,
- uint32_t* out);
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse);
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out);
+
+// This function returns true if src[i] contains a value different from 0xff.
+extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+// This function returns true if src[4*i] contains a value different from 0xff.
+extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
// To be called first before using the above.
-void VP8EncDspARGBInit(void);
+void WebPInitAlphaProcessing(void);
//------------------------------------------------------------------------------
// Filter functions
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index f31bc6d..1c807f1 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -14,16 +14,18 @@
#include <assert.h>
#include <stdlib.h> // for abs()
-#include "./dsp.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
+#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int clip_max(int v, int max) {
return (v > max) ? max : v;
}
+#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
@@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
histo->last_non_zero = last_non_zero;
}
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
}
VP8SetHistogramData(distribution, histo);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// run-time tables (~4k)
@@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
+#if !WEBP_NEON_OMIT_C_CODE
+
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
@@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
}
}
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
int i;
@@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
out[12 + i] = b3 >> 1;
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
#undef MUL
#undef STORE
@@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
@@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
-static void Intra16Preds(uint8_t* dst,
- const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_C(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
@@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Metric
+#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
int w, int h) {
int count = 0;
@@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
return count;
}
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 16);
}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 8);
}
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 8, 8);
}
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
@@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
+#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
@@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_C(a + x + y, b + x + y, w);
}
}
return D;
}
+#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Quantization
@@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
};
// Simple quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
int last = -1;
int n;
for (n = 0; n < 16; ++n) {
@@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (last >= 0);
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Block copy
@@ -682,149 +698,15 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
}
}
-static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
-static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 16, 8);
}
//------------------------------------------------------------------------------
-// SSIM / PSNR
-
-// hat-shaped filter. Sum of coefficients is equal to 16.
-static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
- 1, 2, 3, 4, 3, 2, 1
-};
-static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
-
-static WEBP_INLINE double SSIMCalculation(
- const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
- const uint32_t w2 = N * N;
- const uint32_t C1 = 20 * w2;
- const uint32_t C2 = 60 * w2;
- const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
- const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
- const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
- if (xmxm + ymym >= C3) {
- const int64_t xmym = (int64_t)stats->xm * stats->ym;
- const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
- const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
- const uint64_t syy = (uint64_t)stats->yym * N - ymym;
- // we descale by 8 to prevent overflow during the fnum/fden multiply.
- const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
- const uint64_t den_S = (sxx + syy + C2) >> 8;
- const uint64_t fnum = (2 * xmym + C1) * num_S;
- const uint64_t fden = (xmxm + ymym + C1) * den_S;
- const double r = (double)fnum / fden;
- assert(r >= 0. && r <= 1.0);
- return r;
- }
- return 1.; // area is too dark to contribute meaningfully
-}
-
-double VP8SSIMFromStats(const VP8DistoStats* const stats) {
- return SSIMCalculation(stats, kWeightSum);
-}
-
-double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
- return SSIMCalculation(stats, stats->w);
-}
-
-static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- int xo, int yo, int W, int H) {
- VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
- const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
- const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
- : yo + VP8_SSIM_KERNEL;
- const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
- const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
- : xo + VP8_SSIM_KERNEL;
- int x, y;
- src1 += ymin * stride1;
- src2 += ymin * stride2;
- for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
- for (x = xmin; x <= xmax; ++x) {
- const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
- * kWeight[VP8_SSIM_KERNEL + y - yo];
- const uint32_t s1 = src1[x];
- const uint32_t s2 = src2[x];
- stats.w += w;
- stats.xm += w * s1;
- stats.ym += w * s2;
- stats.xxm += w * s1 * s1;
- stats.xym += w * s1 * s2;
- stats.yym += w * s2 * s2;
- }
- }
- return VP8SSIMFromStatsClipped(&stats);
-}
-
-static double SSIMGet_C(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2) {
- VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
- int x, y;
- for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
- for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
- const uint32_t w = kWeight[x] * kWeight[y];
- const uint32_t s1 = src1[x];
- const uint32_t s2 = src2[x];
- stats.xm += w * s1;
- stats.ym += w * s2;
- stats.xxm += w * s1 * s1;
- stats.xym += w * s1 * s2;
- stats.yym += w * s2 * s2;
- }
- }
- return VP8SSIMFromStats(&stats);
-}
-
-//------------------------------------------------------------------------------
-
-static uint32_t AccumulateSSE(const uint8_t* src1,
- const uint8_t* src2, int len) {
- int i;
- uint32_t sse2 = 0;
- assert(len <= 65535); // to ensure that accumulation fits within uint32_t
- for (i = 0; i < len; ++i) {
- const int32_t diff = src1[i] - src2[i];
- sse2 += diff * diff;
- }
- return sse2;
-}
-
-//------------------------------------------------------------------------------
-
-VP8SSIMGetFunc VP8SSIMGet;
-VP8SSIMGetClippedFunc VP8SSIMGetClipped;
-VP8AccumulateSSEFunc VP8AccumulateSSE;
-
-extern void VP8SSIMDspInitSSE2(void);
-
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
- (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
- if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
- VP8SSIMGetClipped = SSIMGetClipped_C;
- VP8SSIMGet = SSIMGet_C;
-
- VP8AccumulateSSE = AccumulateSSE;
- if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
- if (VP8GetCPUInfo(kSSE2)) {
- VP8SSIMDspInitSSE2();
- }
-#endif
- }
-
- ssim_last_cpuinfo_used = VP8GetCPUInfo;
-}
-
-//------------------------------------------------------------------------------
// Initialization
// Speed-critical function pointers. We have to initialize them to the default
@@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
InitTables();
// default C implementations
- VP8CollectHistogram = CollectHistogram;
- VP8ITransform = ITransform;
- VP8FTransform = FTransform;
- VP8FTransform2 = FTransform2;
- VP8FTransformWHT = FTransformWHT;
- VP8EncPredLuma4 = Intra4Preds;
- VP8EncPredLuma16 = Intra16Preds;
- VP8EncPredChroma8 = IntraChromaPreds;
- VP8SSE16x16 = SSE16x16;
- VP8SSE8x8 = SSE8x8;
- VP8SSE16x8 = SSE16x8;
- VP8SSE4x4 = SSE4x4;
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
- VP8Mean16x4 = Mean16x4;
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8EncQuantizeBlockWHT = QuantizeBlock;
- VP8Copy4x4 = Copy4x4;
- VP8Copy16x8 = Copy16x8;
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8ITransform = ITransform_C;
+ VP8FTransform = FTransform_C;
+ VP8FTransformWHT = FTransformWHT_C;
+ VP8TDisto4x4 = Disto4x4_C;
+ VP8TDisto16x16 = Disto16x16_C;
+ VP8CollectHistogram = CollectHistogram_C;
+ VP8SSE16x16 = SSE16x16_C;
+ VP8SSE16x8 = SSE16x8_C;
+ VP8SSE8x8 = SSE8x8_C;
+ VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+ VP8EncQuantizeBlock = QuantizeBlock_C;
+ VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+ VP8FTransform2 = FTransform2_C;
+ VP8EncPredLuma4 = Intra4Preds_C;
+ VP8EncPredLuma16 = Intra16Preds_C;
+ VP8EncPredChroma8 = IntraChromaPreds_C;
+ VP8Mean16x4 = Mean16x4_C;
+ VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+ VP8Copy4x4 = Copy4x4_C;
+ VP8Copy16x8 = Copy16x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
VP8EncDspInitAVX2();
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- VP8EncDspInitNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspInitMIPS32();
@@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8EncDspInitNEON();
+ }
+#endif
+
+ assert(VP8ITransform != NULL);
+ assert(VP8FTransform != NULL);
+ assert(VP8FTransformWHT != NULL);
+ assert(VP8TDisto4x4 != NULL);
+ assert(VP8TDisto16x16 != NULL);
+ assert(VP8CollectHistogram != NULL);
+ assert(VP8SSE16x16 != NULL);
+ assert(VP8SSE16x8 != NULL);
+ assert(VP8SSE8x8 != NULL);
+ assert(VP8SSE4x4 != NULL);
+ assert(VP8EncQuantizeBlock != NULL);
+ assert(VP8EncQuantize2Blocks != NULL);
+ assert(VP8FTransform2 != NULL);
+ assert(VP8EncPredLuma4 != NULL);
+ assert(VP8EncPredLuma16 != NULL);
+ assert(VP8EncPredChroma8 != NULL);
+ assert(VP8Mean16x4 != NULL);
+ assert(VP8EncQuantizeBlockWHT != NULL);
+ assert(VP8Copy4x4 != NULL);
+ assert(VP8Copy16x8 != NULL);
+
enc_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/enc_avx2.c b/src/3rdparty/libwebp/src/dsp/enc_avx2.c
index 93efb30..8bc5798 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_avx2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_avx2.c
@@ -9,7 +9,7 @@
//
// AVX2 version of speed-critical encoding functions.
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_AVX2)
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
index 752b14d..618f0fc 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -13,13 +13,13 @@
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
// Slobodan Prijic (slobodan.prijic@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
-#include "./mips_macro.h"
-#include "../enc/vp8i_enc.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@@ -113,8 +113,9 @@ static const int kC2 = 35468;
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
- uint8_t* dst) {
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+ const int16_t* in,
+ uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
-static void ITransform(const uint8_t* ref, const int16_t* in,
- uint8_t* dst, int do_two) {
- ITransformOne(ref, in, dst);
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
+ ITransformOne_MIPS32(ref, in, dst);
if (do_two) {
- ITransformOne(ref + 4, in + 16, dst + 4);
+ ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
}
}
@@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
"sh %[temp5], " #J "(%[ppin]) \n\t" \
"sh %[level], " #N "(%[pout]) \n\t"
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int sign, coeff, level, i;
int max_level = MAX_LEVEL;
@@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
- nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
- nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
}
}
return D;
@@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
@@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return count;
}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return count;
}
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return count;
}
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
extern void VP8EncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
- VP8ITransform = ITransform;
- VP8FTransform = FTransform;
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
+ VP8ITransform = ITransform_MIPS32;
+ VP8FTransform = FTransform_MIPS32;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+ VP8TDisto4x4 = Disto4x4_MIPS32;
+ VP8TDisto16x16 = Disto16x16_MIPS32;
+
#if !defined(WORK_AROUND_GCC)
- VP8SSE16x16 = SSE16x16;
- VP8SSE8x8 = SSE8x8;
- VP8SSE16x8 = SSE16x8;
- VP8SSE4x4 = SSE4x4;
+ VP8SSE16x16 = SSE16x16_MIPS32;
+ VP8SSE8x8 = SSE8x8_MIPS32;
+ VP8SSE16x8 = SSE16x8_MIPS32;
+ VP8SSE4x4 = SSE4x4_MIPS32;
#endif
}
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
index 6c8c1c6..9ddd895 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -12,13 +12,13 @@
// Author(s): Darko Laus (darko.laus@imgtec.com)
// Mirko Raus (mirko.raus@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "./mips_macro.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@@ -141,7 +141,8 @@ static const int kC2 = 35468;
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
const int c2217 = 2217;
const int c5352 = 5352;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
@@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
return abs(temp3 - temp17) >> 5;
}
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+ const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
}
}
return D;
@@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
// U block
DCMode8(C8DC8 + dst, left, top);
VerticalPred8(C8VE8 + dst, top);
@@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
-static void Intra16Preds(uint8_t* dst,
- const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
DCMode16(I16DC16 + dst, left, top);
VerticalPred16(I16VE16 + dst, top);
HorizontalPred16(I16HE16 + dst, left);
@@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
GET_SSE_INNER(C) \
GET_SSE_INNER(D)
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return count;
}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return count;
}
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return count;
}
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
"usw $0, " #J "(%[ppin]) \n\t" \
"3: \n\t"
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
int sign, coeff, level;
int max_level = MAX_LEVEL;
@@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (ret != 0);
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
- nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
- nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
"usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
"usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
@@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
"addiu %[temp8], %[temp8], 1 \n\t" \
"sw %[temp8], 0(%[temp3]) \n\t"
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
extern void VP8EncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
- VP8FTransform = FTransform;
- VP8ITransform = ITransform;
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
- VP8EncPredLuma16 = Intra16Preds;
- VP8EncPredChroma8 = IntraChromaPreds;
- VP8EncPredLuma4 = Intra4Preds;
+ VP8FTransform = FTransform_MIPSdspR2;
+ VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+ VP8ITransform = ITransform_MIPSdspR2;
+
+ VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+ VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+ VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+ VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+ VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
#if !defined(WORK_AROUND_GCC)
- VP8SSE16x16 = SSE16x16;
- VP8SSE8x8 = SSE8x8;
- VP8SSE16x8 = SSE16x8;
- VP8SSE4x4 = SSE4x4;
+ VP8SSE16x16 = SSE16x16_MIPSdspR2;
+ VP8SSE8x8 = SSE8x8_MIPSdspR2;
+ VP8SSE16x8 = SSE16x8_MIPSdspR2;
+ VP8SSE4x4 = SSE4x4_MIPSdspR2;
#endif
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8FTransformWHT = FTransformWHT;
- VP8CollectHistogram = CollectHistogram;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+ VP8CollectHistogram = CollectHistogram_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_msa.c b/src/3rdparty/libwebp/src/dsp/enc_msa.c
index 909b46d..6f85add 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_msa.c
@@ -11,13 +11,13 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include <stdlib.h>
-#include "./msa_macro.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/msa_macro.h"
+#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms
@@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 t0, t1, t2, t3;
- v16u8 srcl0, srcl1, src0, src1;
+ v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
SD4(out0, out1, out2, out3, out, 8);
}
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
@@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
ST_SH2(out0, out1, out, 8);
}
-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
int sum;
uint32_t in0_m, in1_m, in2_m, in3_m;
- v16i8 src0;
+ v16i8 src0 = { 0 };
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
v4i32 dst0, dst1;
const v16i8 zero = { 0 };
@@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int sum1 = TTransform(a, w);
- const int sum2 = TTransform(b, w);
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int sum1 = TTransform_MSA(a, w);
+ const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5;
}
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_MSA(a + x + y, b + x + y, w);
}
}
return D;
@@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
// Histogram
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top - 1);
- const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
@@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+ const v16u8 A2 = { 0 };
const uint64_t val_m = LD(top - 5);
- const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+ const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
@@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+ const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top);
- const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C1 = SLDI_UB(A, A, 2);
const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
#undef AVG3
#undef AVG2
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
STORE16x16(out, dst);
}
-static void Intra16Preds(uint8_t* dst,
- const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
DCMode16x16(I16DC16 + dst, left, top);
VerticalPred16x16(I16VE16 + dst, top);
HorizontalPred16x16(I16HE16 + dst, left);
@@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint64_t out;
- v16u8 src;
+ v16u8 src = { 0 };
if (top != NULL && left != NULL) {
const uint64_t left_m = LD(left);
const uint64_t top_m = LD(top);
@@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
STORE8x8(out, dst);
}
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
// U block
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
@@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return sum;
}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return sum;
}
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return sum;
}
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum = 0;
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 src, ref, tmp0, tmp1;
+ v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
v8i16 diff0, diff1;
v4i32 out0, out1;
@@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
// Quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
int sum;
v8i16 in0, in1, sh0, sh1, out0, out1;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
tmp1 = (tmp3 > maxlevel);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
- SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
+ SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
@@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (sum > 0);
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
- VP8ITransform = ITransform;
- VP8FTransform = FTransform;
- VP8FTransformWHT = FTransformWHT;
-
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
- VP8CollectHistogram = CollectHistogram;
-
- VP8EncPredLuma4 = Intra4Preds;
- VP8EncPredLuma16 = Intra16Preds;
- VP8EncPredChroma8 = IntraChromaPreds;
-
- VP8SSE16x16 = SSE16x16;
- VP8SSE16x8 = SSE16x8;
- VP8SSE8x8 = SSE8x8;
- VP8SSE4x4 = SSE4x4;
-
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8EncQuantizeBlockWHT = QuantizeBlock;
+ VP8ITransform = ITransform_MSA;
+ VP8FTransform = FTransform_MSA;
+ VP8FTransformWHT = FTransformWHT_MSA;
+
+ VP8TDisto4x4 = Disto4x4_MSA;
+ VP8TDisto16x16 = Disto16x16_MSA;
+ VP8CollectHistogram = CollectHistogram_MSA;
+
+ VP8EncPredLuma4 = Intra4Preds_MSA;
+ VP8EncPredLuma16 = Intra16Preds_MSA;
+ VP8EncPredChroma8 = IntraChromaPreds_MSA;
+
+ VP8SSE16x16 = SSE16x16_MSA;
+ VP8SSE16x8 = SSE16x8_MSA;
+ VP8SSE8x8 = SSE8x8_MSA;
+ VP8SSE4x4 = SSE4x4_MSA;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MSA;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+ VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
}
#else // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c
index 6a078d6..43bf124 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -11,14 +11,14 @@
//
// adapted from libvpx (http://www.webmproject.org/code/)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
-#include "./neon.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/neon.h"
+#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@@ -37,15 +37,15 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
- const int16x8_t dst01,
- const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
- const uint8_t* const ref, uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+ const int16x8_t row23,
+ const uint8_t* const ref,
+ uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
@@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
{
// Convert to 16b.
- const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
- const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+ const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+ const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
- SaturateAndStore4x4(dst, out01, out23);
+ SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
- int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+ const int16x8_t in1,
+ int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
@@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
@@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
- Transpose8x2(E0, E1, rows);
+ Transpose8x2_NEON(E0, E1, rows);
}
-static void ITransformOne(const uint8_t* ref,
- const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
- TransformPass(&rows);
- TransformPass(&rows);
- Add4x4(rows.val[0], rows.val[1], ref, dst);
+ TransformPass_NEON(&rows);
+ TransformPass_NEON(&rows);
+ Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}
#else
-static void ITransformOne(const uint8_t* ref,
- const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
@@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,
#endif // WEBP_USE_INTRINSICS
-static void ITransform(const uint8_t* ref,
- const int16_t* in, uint8_t* dst, int do_two) {
- ITransformOne(ref, in, dst);
+static void ITransform_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst, int do_two) {
+ ITransformOne_NEON(ref, in, dst);
if (do_two) {
- ITransformOne(ref + 4, in + 16, dst + 4);
+ ITransformOne_NEON(ref + 4, in + 16, dst + 4);
}
}
// Load all 4x4 pixels into a single uint8x16_t variable.
-static uint8x16_t Load4x4(const uint8_t* src) {
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {
#if defined(WEBP_USE_INTRINSICS)
-static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
- const int16x4_t C, const int16x4_t D,
- int16x8_t* const out01,
- int16x8_t* const out32) {
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+ const int16x4_t B,
+ const int16x4_t C,
+ const int16x4_t D,
+ int16x8_t* const out01,
+ int16x8_t* const out32) {
const int16x4x2_t AB = vtrn_s16(A, B);
const int16x4x2_t CD = vtrn_s16(C, D);
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
vreinterpret_s64_s32(tmp02.val[1])));
}
-static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
- const uint8x8_t b) {
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+ const uint8x8_t b) {
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
-static void FTransform(const uint8_t* src, const uint8_t* ref,
- int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
- const uint8x16_t S0 = Load4x4(src);
- const uint8x16_t R0 = Load4x4(ref);
- const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
- const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
+ const uint8x16_t S0 = Load4x4_NEON(src);
+ const uint8x16_t R0 = Load4x4_NEON(ref);
+ const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+ const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
const int16x4_t D0 = vget_low_s16(D0D1);
const int16x4_t D1 = vget_high_s16(D0D1);
const int16x4_t D2 = vget_low_s16(D2D3);
const int16x4_t D3 = vget_high_s16(D2D3);
- Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
+ Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
}
{ // 1rst pass
const int32x4_t kCst937 = vdupq_n_s32(937);
@@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
- Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+ Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
}
{ // 2nd pass
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
@@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
51000, 51000, 51000, 51000
};
-static void FTransform(const uint8_t* src, const uint8_t* ref,
- int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
@@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
src += stride; \
} while (0)
-static void FTransformWHT(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
@@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
tmp0.val[3] = vsubq_s32(a0, a1);
}
{
- const int32x4x4_t tmp1 = Transpose4x4(tmp0);
+ const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
// a0 = tmp[0 + i] + tmp[ 8 + i]
// a1 = tmp[4 + i] + tmp[12 + i]
// a2 = tmp[4 + i] - tmp[12 + i]
@@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
// a 26ae, b 26ae
// a 37bf, b 37bf
//
-static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
return q4_in;
}
-static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+ const int16x8x4_t q4_in) {
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
return q4_out;
}
-static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
q4_in.val[2]));
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
return q4_out;
}
-static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
int16x4x4_t d4_w;
@@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
return d4_w;
}
-static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
- const int16x4x4_t d4_w) {
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+ const int16x4x4_t d4_w) {
int32x2_t d_sum;
// sum += w[ 0] * abs(b0);
// sum += w[ 4] * abs(b1);
@@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent
// transpose.
- const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
- const int16x4x4_t d4_w = DistoLoadW(w);
+ const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+ const int16x4x4_t d4_w = DistoLoadW_NEON(w);
// horizontal pass
- const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
- const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
- int32x2_t d_sum = DistoSum(q4_h, d4_w);
+ const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+ const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+ int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
// abs(sum2 - sum1) >> 5
d_sum = vabs_s32(d_sum);
@@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
}
#undef LOAD_LANE_32b
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_NEON(a + x + y, b + x + y, w);
}
}
return D;
@@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
- FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
const int16x8_t a0 = vld1q_s16(out + 0);
@@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
//------------------------------------------------------------------------------
-static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
- const uint8_t* const b,
- uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+ const uint8_t* const b,
+ uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
}
// Horizontal sum of all four uint32_t values in 'sum'.
-static int SumToInt(uint32x4_t sum) {
+static int SumToInt_NEON(uint32x4_t sum) {
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
@@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
- AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+ AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
- return SumToInt(sum);
+ return SumToInt_NEON(sum);
}
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
- AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+ AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
- return SumToInt(sum);
+ return SumToInt_NEON(sum);
}
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
sum = vpadalq_u16(sum, prod);
}
- return SumToInt(sum);
+ return SumToInt_NEON(sum);
}
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
- const uint8x16_t a0 = Load4x4(a);
- const uint8x16_t b0 = Load4x4(b);
+ const uint8x16_t a0 = Load4x4_NEON(a);
+ const uint8x16_t b0 = Load4x4_NEON(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
@@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
- return SumToInt(vaddq_u32(sum1, sum2));
+ return SumToInt_NEON(vaddq_u32(sum1, sum2));
}
//------------------------------------------------------------------------------
@@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
-static int16x8_t Quantize(int16_t* const in,
- const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* const in,
+ const VP8Matrix* const mtx, int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
{ 14, 15, 22, 23, 28, 29, 30, 31 }
};
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- const int16x8_t out0 = Quantize(in, mtx, 0);
- const int16x8_t out1 = Quantize(in, mtx, 8);
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+ const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
@@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
- nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
- nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
- VP8ITransform = ITransform;
- VP8FTransform = FTransform;
+ VP8ITransform = ITransform_NEON;
+ VP8FTransform = FTransform_NEON;
- VP8FTransformWHT = FTransformWHT;
+ VP8FTransformWHT = FTransformWHT_NEON;
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
- VP8CollectHistogram = CollectHistogram;
+ VP8TDisto4x4 = Disto4x4_NEON;
+ VP8TDisto16x16 = Disto16x16_NEON;
+ VP8CollectHistogram = CollectHistogram_NEON;
VP8SSE16x16 = SSE16x16_NEON;
VP8SSE16x8 = SSE16x8_NEON;
@@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8SSE4x4 = SSE4x4_NEON;
#if !defined(WORK_AROUND_GCC)
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
+ VP8EncQuantizeBlock = QuantizeBlock_NEON;
+ VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index 2026a74..7b3f142 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -11,23 +11,23 @@
//
// Author: Christian Duvivier (cduvivier@google.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <stdlib.h> // for abs()
#include <emmintrin.h>
-#include "./common_sse2.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
-static void FTransformPass1(const __m128i* const in01,
- const __m128i* const in23,
- __m128i* const out01,
- __m128i* const out32) {
+static void FTransformPass1_SSE2(const __m128i* const in01,
+ const __m128i* const in23,
+ __m128i* const out01,
+ __m128i* const out32) {
const __m128i k937 = _mm_set1_epi32(937);
const __m128i k1812 = _mm_set1_epi32(1812);
@@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
*out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
}
-static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
- int16_t* out) {
+static void FTransformPass2_SSE2(const __m128i* const v01,
+ const __m128i* const v32,
+ int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
@@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
_mm_storeu_si128((__m128i*)&out[8], d2_f3);
}
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
const __m128i zero = _mm_setzero_si128();
// Load src.
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
__m128i v01, v32;
// First pass
- FTransformPass1(&row01, &row23, &v01, &v32);
+ FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
// Second pass
- FTransformPass2(&v01, &v32, out);
+ FTransformPass2_SSE2(&v01, &v32, out);
}
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
const __m128i zero = _mm_setzero_si128();
// Load src and convert to 16b.
@@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
__m128i v01h, v32h;
// First pass
- FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
- FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
+ FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+ FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
// Second pass
- FTransformPass2(&v01l, &v32l, out + 0);
- FTransformPass2(&v01h, &v32h, out + 16);
+ FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+ FTransformPass2_SSE2(&v01h, &v32h, out + 16);
}
-static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
*out = _mm_madd_epi16(D, kMult);
}
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
// Input is 12b signed.
__m128i row0, row1, row2, row3;
// Rows are 14b signed.
- FTransformWHTRow(in + 0 * 64, &row0);
- FTransformWHTRow(in + 1 * 64, &row1);
- FTransformWHTRow(in + 2 * 64, &row2);
- FTransformWHTRow(in + 3 * 64, &row3);
+ FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+ FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+ FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+ FTransformWHTRow_SSE2(in + 3 * 64, &row3);
{
// The a* are 15b signed.
@@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
const __m128i zero = _mm_setzero_si128();
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
@@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int16_t out[16];
int k;
- FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
@@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// Intra predictions
// helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 8; ++j) {
@@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
}
}
-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 16; ++j) {
@@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
}
}
-static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
if (size == 4) {
int j;
for (j = 0; j < 4; ++j) {
memset(dst + j * BPS, value, 4);
}
} else if (size == 8) {
- Put8x8uv(value, dst);
+ Put8x8uv_SSE2(value, dst);
} else {
- Put16(value, dst);
+ Put16_SSE2(value, dst);
}
}
-static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
int j;
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
for (j = 0; j < 8; ++j) {
@@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
}
}
-static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i top_values = _mm_load_si128((const __m128i*)top);
int j;
for (j = 0; j < 16; ++j) {
@@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
}
}
-static WEBP_INLINE void VerticalPred(uint8_t* dst,
- const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+ const uint8_t* top, int size) {
if (top != NULL) {
if (size == 8) {
- VE8uv(dst, top);
+ VE8uv_SSE2(dst, top);
} else {
- VE16(dst, top);
+ VE16_SSE2(dst, top);
}
} else {
- Fill(dst, 127, size);
+ Fill_SSE2(dst, 127, size);
}
}
-static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
int j;
for (j = 0; j < 8; ++j) {
const __m128i values = _mm_set1_epi8(left[j]);
@@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
}
}
-static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
int j;
for (j = 0; j < 16; ++j) {
const __m128i values = _mm_set1_epi8(left[j]);
@@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
}
}
-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
- const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+ const uint8_t* left, int size) {
if (left != NULL) {
if (size == 8) {
- HE8uv(dst, left);
+ HE8uv_SSE2(dst, left);
} else {
- HE16(dst, left);
+ HE16_SSE2(dst, left);
}
} else {
- Fill(dst, 129, size);
+ Fill_SSE2(dst, 129, size);
}
}
-static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
- const uint8_t* top, int size) {
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
const __m128i zero = _mm_setzero_si128();
int y;
if (size == 8) {
@@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
}
}
-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
- const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
if (left != NULL) {
if (top != NULL) {
- TM(dst, left, top, size);
+ TM_SSE2(dst, left, top, size);
} else {
- HorizontalPred(dst, left, size);
+ HorizontalPred_SSE2(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
@@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
- VerticalPred(dst, top, size);
+ VerticalPred_SSE2(dst, top, size);
} else {
- Fill(dst, 129, size);
+ Fill_SSE2(dst, 129, size);
}
}
}
-static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
const int DC = VP8HorizontalAdd8b(&combined) + 8;
- Put8x8uv(DC >> 4, dst);
+ Put8x8uv_SSE2(DC >> 4, dst);
}
-static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i sum = _mm_sad_epu8(top_values, zero);
const int DC = _mm_cvtsi128_si32(sum) + 4;
- Put8x8uv(DC >> 3, dst);
+ Put8x8uv_SSE2(DC >> 3, dst);
}
-static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
// 'left' is contiguous so we can reuse the top summation.
- DC8uvNoLeft(dst, left);
+ DC8uvNoLeft_SSE2(dst, left);
}
-static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
- Put8x8uv(0x80, dst);
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+ Put8x8uv_SSE2(0x80, dst);
}
-static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
if (top != NULL) {
if (left != NULL) { // top and left present
- DC8uv(dst, left, top);
+ DC8uv_SSE2(dst, left, top);
} else { // top, but no left
- DC8uvNoLeft(dst, top);
+ DC8uvNoLeft_SSE2(dst, top);
}
} else if (left != NULL) { // left but no top
- DC8uvNoTop(dst, left);
+ DC8uvNoTop_SSE2(dst, left);
} else { // no top, no left, nothing.
- DC8uvNoTopLeft(dst);
+ DC8uvNoTopLeft_SSE2(dst);
}
}
-static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const __m128i left_row = _mm_load_si128((const __m128i*)left);
const int DC =
VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
- Put16(DC >> 5, dst);
+ Put16_SSE2(DC >> 5, dst);
}
-static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const int DC = VP8HorizontalAdd8b(&top_row) + 8;
- Put16(DC >> 4, dst);
+ Put16_SSE2(DC >> 4, dst);
}
-static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
// 'left' is contiguous so we can reuse the top summation.
- DC16NoLeft(dst, left);
+ DC16NoLeft_SSE2(dst, left);
}
-static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
- Put16(0x80, dst);
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+ Put16_SSE2(0x80, dst);
}
-static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
if (top != NULL) {
if (left != NULL) { // top and left present
- DC16(dst, left, top);
+ DC16_SSE2(dst, left, top);
} else { // top, but no left
- DC16NoLeft(dst, top);
+ DC16NoLeft_SSE2(dst, top);
}
} else if (left != NULL) { // left but no top
- DC16NoTop(dst, left);
+ DC16NoTop_SSE2(dst, left);
} else { // no top, no left, nothing.
- DC16NoTopLeft(dst);
+ DC16NoTopLeft_SSE2(dst);
}
}
@@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // vertical
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
}
}
-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
- Fill(dst, dc >> 3, 4);
+ Fill_SSE2(dst, dc >> 3, 4);
}
-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Down-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
-static WEBP_INLINE void VR4(uint8_t* dst,
- const uint8_t* top) { // Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Right
const __m128i one = _mm_set1_epi8(1);
const int I = top[-2];
const int J = top[-3];
@@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
DST(0, 3) = AVG3(K, J, I);
}
-static WEBP_INLINE void VL4(uint8_t* dst,
- const uint8_t* top) { // Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
DST(3, 3) = (extra_out >> 8) & 0xff;
}
-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Down-right
const __m128i one = _mm_set1_epi8(1);
const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
@@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
DST(1, 3) = AVG3(L, K, J);
}
-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
- DC4(I4DC4 + dst, top);
- TM4(I4TM4 + dst, top);
- VE4(I4VE4 + dst, top);
- HE4(I4HE4 + dst, top);
- RD4(I4RD4 + dst, top);
- VR4(I4VR4 + dst, top);
- LD4(I4LD4 + dst, top);
- VL4(I4VL4 + dst, top);
- HD4(I4HD4 + dst, top);
- HU4(I4HU4 + dst, top);
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+ DC4_SSE2(I4DC4 + dst, top);
+ TM4_SSE2(I4TM4 + dst, top);
+ VE4_SSE2(I4VE4 + dst, top);
+ HE4_SSE2(I4HE4 + dst, top);
+ RD4_SSE2(I4RD4 + dst, top);
+ VR4_SSE2(I4VR4 + dst, top);
+ LD4_SSE2(I4LD4 + dst, top);
+ VL4_SSE2(I4VL4 + dst, top);
+ HD4_SSE2(I4HD4 + dst, top);
+ HU4_SSE2(I4HU4 + dst, top);
}
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
- const uint8_t* top) {
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
// U block
- DC8uvMode(C8DC8 + dst, left, top);
- VerticalPred(C8VE8 + dst, top, 8);
- HorizontalPred(C8HE8 + dst, left, 8);
- TrueMotion(C8TM8 + dst, left, top, 8);
+ DC8uvMode_SSE2(C8DC8 + dst, left, top);
+ VerticalPred_SSE2(C8VE8 + dst, top, 8);
+ HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+ TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
- DC8uvMode(C8DC8 + dst, left, top);
- VerticalPred(C8VE8 + dst, top, 8);
- HorizontalPred(C8HE8 + dst, left, 8);
- TrueMotion(C8TM8 + dst, left, top, 8);
+ DC8uvMode_SSE2(C8DC8 + dst, left, top);
+ VerticalPred_SSE2(C8VE8 + dst, top, 8);
+ HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+ TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
-static void Intra16Preds(uint8_t* dst,
- const uint8_t* left, const uint8_t* top) {
- DC16Mode(I16DC16 + dst, left, top);
- VerticalPred(I16VE16 + dst, top, 16);
- HorizontalPred(I16HE16 + dst, left, 16);
- TrueMotion(I16TM16 + dst, left, top, 16);
+static void Intra16Preds_SSE2(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DC16Mode_SSE2(I16DC16 + dst, left, top);
+ VerticalPred_SSE2(I16VE16 + dst, top, 16);
+ HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+ TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// Metric
-static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
- __m128i* const sum) {
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+ const __m128i b,
+ __m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
@@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
*sum = _mm_add_epi32(sum1, sum2);
}
-static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
- int num_pairs) {
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+ int num_pairs) {
__m128i sum = _mm_setzero_si128();
int32_t tmp[4];
int i;
@@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
__m128i sum1, sum2;
- SubtractAndAccumulate(a0, b0, &sum1);
- SubtractAndAccumulate(a1, b1, &sum2);
+ SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+ SubtractAndAccumulate_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
a += 2 * BPS;
b += 2 * BPS;
@@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
- return SSE_16xN(a, b, 8);
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN_SSE2(a, b, 8);
}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
- return SSE_16xN(a, b, 4);
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN_SSE2(a, b, 4);
}
#define LOAD_8x16b(ptr) \
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
int num_pairs = 4;
__m128i sum = zero;
@@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
}
#undef LOAD_8x16b
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
@@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
const __m128i mask = _mm_set1_epi16(0x00ff);
const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
- const uint16_t* const w) {
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
@@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform_SSE2(a, b, w);
return abs(diff_sum) >> 5;
}
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_SSE2(a + x + y, b + x + y, w);
}
}
return D;
@@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
- const uint16_t* const sharpen,
- const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
@@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
}
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
- nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
- nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
@@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
- VP8CollectHistogram = CollectHistogram;
- VP8EncPredLuma16 = Intra16Preds;
- VP8EncPredChroma8 = IntraChromaPreds;
- VP8EncPredLuma4 = Intra4Preds;
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
- VP8ITransform = ITransform;
- VP8FTransform = FTransform;
- VP8FTransform2 = FTransform2;
- VP8FTransformWHT = FTransformWHT;
- VP8SSE16x16 = SSE16x16;
- VP8SSE16x8 = SSE16x8;
- VP8SSE8x8 = SSE8x8;
- VP8SSE4x4 = SSE4x4;
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
- VP8Mean16x4 = Mean16x4;
-}
-
-//------------------------------------------------------------------------------
-// SSIM / PSNR entry point (TODO(skal): move to its own file later)
-
-static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
- const uint8_t* src2, int len) {
- int i = 0;
- uint32_t sse2 = 0;
- if (len >= 16) {
- const int limit = len - 32;
- int32_t tmp[4];
- __m128i sum1;
- __m128i sum = _mm_setzero_si128();
- __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
- __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
- i += 16;
- while (i <= limit) {
- const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
- const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
- __m128i sum2;
- i += 16;
- SubtractAndAccumulate(a0, b0, &sum1);
- sum = _mm_add_epi32(sum, sum1);
- a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
- b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
- i += 16;
- SubtractAndAccumulate(a1, b1, &sum2);
- sum = _mm_add_epi32(sum, sum2);
- }
- SubtractAndAccumulate(a0, b0, &sum1);
- sum = _mm_add_epi32(sum, sum1);
- _mm_storeu_si128((__m128i*)tmp, sum);
- sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
- }
-
- for (; i < len; ++i) {
- const int32_t diff = src1[i] - src2[i];
- sse2 += diff * diff;
- }
- return sse2;
-}
-
-static uint32_t HorizontalAdd16b(const __m128i* const m) {
- uint16_t tmp[8];
- const __m128i a = _mm_srli_si128(*m, 8);
- const __m128i b = _mm_add_epi16(*m, a);
- _mm_storeu_si128((__m128i*)tmp, b);
- return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
-}
-
-static uint32_t HorizontalAdd32b(const __m128i* const m) {
- const __m128i a = _mm_srli_si128(*m, 8);
- const __m128i b = _mm_add_epi32(*m, a);
- const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
- return (uint32_t)_mm_cvtsi128_si32(c);
-}
-
-static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
-
-#define ACCUMULATE_ROW(WEIGHT) do { \
- /* compute row weight (Wx * Wy) */ \
- const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
- const __m128i W = _mm_mullo_epi16(Wx, Wy); \
- /* process 8 bytes at a time (7 bytes, actually) */ \
- const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
- const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
- /* convert to 16b and multiply by weight */ \
- const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
- const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
- const __m128i wa1 = _mm_mullo_epi16(a1, W); \
- const __m128i wb1 = _mm_mullo_epi16(b1, W); \
- /* accumulate */ \
- xm = _mm_add_epi16(xm, wa1); \
- ym = _mm_add_epi16(ym, wb1); \
- xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
- xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
- yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
- src1 += stride1; \
- src2 += stride2; \
-} while (0)
-
-static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2) {
- VP8DistoStats stats;
- const __m128i zero = _mm_setzero_si128();
- __m128i xm = zero, ym = zero; // 16b accums
- __m128i xxm = zero, yym = zero, xym = zero; // 32b accum
- const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
- assert(2 * VP8_SSIM_KERNEL + 1 == 7);
- ACCUMULATE_ROW(1);
- ACCUMULATE_ROW(2);
- ACCUMULATE_ROW(3);
- ACCUMULATE_ROW(4);
- ACCUMULATE_ROW(3);
- ACCUMULATE_ROW(2);
- ACCUMULATE_ROW(1);
- stats.xm = HorizontalAdd16b(&xm);
- stats.ym = HorizontalAdd16b(&ym);
- stats.xxm = HorizontalAdd32b(&xxm);
- stats.xym = HorizontalAdd32b(&xym);
- stats.yym = HorizontalAdd32b(&yym);
- return VP8SSIMFromStats(&stats);
-}
-
-extern void VP8SSIMDspInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
- VP8AccumulateSSE = AccumulateSSE_SSE2;
- VP8SSIMGet = SSIMGet_SSE2;
+ VP8CollectHistogram = CollectHistogram_SSE2;
+ VP8EncPredLuma16 = Intra16Preds_SSE2;
+ VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+ VP8EncPredLuma4 = Intra4Preds_SSE2;
+ VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+ VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+ VP8ITransform = ITransform_SSE2;
+ VP8FTransform = FTransform_SSE2;
+ VP8FTransform2 = FTransform2_SSE2;
+ VP8FTransformWHT = FTransformWHT_SSE2;
+ VP8SSE16x16 = SSE16x16_SSE2;
+ VP8SSE16x8 = SSE16x8_SSE2;
+ VP8SSE8x8 = SSE8x8_SSE2;
+ VP8SSE4x4 = SSE4x4_SSE2;
+ VP8TDisto4x4 = Disto4x4_SSE2;
+ VP8TDisto16x16 = Disto16x16_SSE2;
+ VP8Mean16x4 = Mean16x4_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
-WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
index e32086d..924035a 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
@@ -11,21 +11,21 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include <stdlib.h> // for abs()
-#include "./common_sse2.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
- const uint16_t* const w) {
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
@@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform_SSE41(a, b, w);
return abs(diff_sum) >> 5;
}
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4(a + x + y, b + x + y, w);
+ D += Disto4x4_SSE41(a + x + y, b + x + y, w);
}
}
return D;
@@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
- const uint16_t* const sharpen,
- const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i out0, out8;
@@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
#undef PSHUFB_CST
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
}
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
- const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
- nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
- nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
@@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
- VP8CollectHistogram = CollectHistogram;
- VP8EncQuantizeBlock = QuantizeBlock;
- VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
- VP8TDisto4x4 = Disto4x4;
- VP8TDisto16x16 = Disto16x16;
+ VP8CollectHistogram = CollectHistogram_SSE41;
+ VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+ VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+ VP8TDisto4x4 = Disto4x4_SSE41;
+ VP8TDisto16x16 = Disto16x16_SSE41;
}
#else // !WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
index 65f34aa..ca5f877 100644
--- a/src/3rdparty/libwebp/src/dsp/filters.c
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -11,7 +11,7 @@
//
// Author: Urvang (urvang@google.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@@ -20,16 +20,17 @@
// Helpful macro.
# define SANITY_CHECK(in, out) \
- assert(in != NULL); \
- assert(out != NULL); \
+ assert((in) != NULL); \
+ assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
-static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
- uint8_t* dst, int length, int inverse) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
+ uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@@ -41,10 +42,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
//------------------------------------------------------------------------------
// Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows,
- int inverse, uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@@ -56,7 +57,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
- PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
@@ -66,8 +67,8 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
- PredictLine(in, preds - stride, out, 1, inverse);
- PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ PredictLine_C(in, preds - stride, out, 1, inverse);
+ PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
@@ -78,10 +79,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows,
- int inverse, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@@ -94,7 +95,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
- PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
@@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
- PredictLine(in, preds, out, width, inverse);
+ PredictLine_C(in, preds, out, width, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Gradient filter.
-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows,
- int inverse, uint8_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
- PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
@@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
while (row < last_row) {
int w;
// leftmost pixel: predict from above.
- PredictLine(in, preds - stride, out, 1, inverse);
+ PredictLine_C(in, preds - stride, out, 1, inverse);
for (w = 1; w < width; ++w) {
- const int pred = GradientPredictor(preds[w - 1],
- preds[w - stride],
- preds[w - stride - 1]);
+ const int pred = GradientPredictor_C(preds[w - 1],
+ preds[w - stride],
+ preds[w - stride - 1]);
out[w] = in[w] + (inverse ? pred : -pred);
}
++row;
@@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
out += stride;
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
#undef SANITY_CHECK
//------------------------------------------------------------------------------
-static void HorizontalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+#if !WEBP_NEON_OMIT_C_CODE
+static void HorizontalFilter_C(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
+ filtered_data);
}
-static void VerticalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void VerticalFilter_C(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
-
-static void GradientFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void GradientFilter_C(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
-
+#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
uint8_t pred = (prev == NULL) ? 0 : prev[0];
int i;
for (i = 0; i < width; ++i) {
@@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
}
}
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
if (prev == NULL) {
- HorizontalUnfilter(NULL, in, out, width);
+ HorizontalUnfilter_C(NULL, in, out, width);
} else {
int i;
for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
if (prev == NULL) {
- HorizontalUnfilter(NULL, in, out, width);
+ HorizontalUnfilter_C(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==out
- left = in[i] + GradientPredictor(left, top, top_left);
+ left = in[i] + GradientPredictor_C(left, top, top_left);
top_left = top;
out[i] = left;
}
@@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
- WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
- WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
- WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+#if !WEBP_NEON_OMIT_C_CODE
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
+#endif
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
WebPFilters[WEBP_FILTER_NONE] = NULL;
- WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
- WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
- WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+#if !WEBP_NEON_OMIT_C_CODE
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
+#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
VP8FiltersInitSSE2();
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- VP8FiltersInitNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8FiltersInitMIPSdspR2();
@@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8FiltersInitNEON();
+ }
+#endif
+
+ assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
+ assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
+ assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
+ assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
+ assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
+ assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
+
filters_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
index 1d82e3c..9382b12 100644
--- a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
@@ -12,11 +12,11 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "../dsp/dsp.h"
+#include "src/dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@@ -101,8 +101,8 @@
); \
} while (0)
-static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
- int length) {
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+ int length) {
DO_PREDICT_LINE(src, dst, length, 0);
}
@@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
} \
} while (0)
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows,
- uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
- PredictLine(in + 1, out + 1, width - 1);
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
@@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
-static void HorizontalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+ int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
}
//------------------------------------------------------------------------------
@@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
} \
} while (0)
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
- PredictLine(in + 1, out + 1, width - 1);
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
-static void VerticalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
}
//------------------------------------------------------------------------------
// Gradient filter.
-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
int temp0;
__asm__ volatile (
"addu %[temp0], %[a], %[b] \n\t"
@@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
int w; \
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
for (w = 1; w < width; ++w) { \
- const int pred = GradientPredictor(PREDS[w - 1], \
- PREDS[w - stride], \
- PREDS[w - stride - 1]); \
+ const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
+ PREDS[w - stride], \
+ PREDS[w - stride - 1]); \
out[w] = in[w] OPERATION pred; \
} \
++row; \
@@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
} \
} while (0)
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows, uint8_t* out) {
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
- PredictLine(in + 1, out + 1, width - 1);
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
@@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
-static void GradientFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
}
//------------------------------------------------------------------------------
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
}
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
if (prev == NULL) {
- HorizontalUnfilter(NULL, in, out, width);
+ HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
}
}
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
if (prev == NULL) {
- HorizontalUnfilter(NULL, in, out, width);
+ HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==dst
- left = in[i] + GradientPredictor(left, top, top_left);
+ left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
top_left = top;
out[i] = left;
}
@@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
extern void VP8FiltersInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
- WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
- WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
- WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
- WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
- WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
- WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/filters_msa.c b/src/3rdparty/libwebp/src/dsp/filters_msa.c
index 4b8922d..14c437d 100644
--- a/src/3rdparty/libwebp/src/dsp/filters_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/filters_msa.c
@@ -11,11 +11,11 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"
#include <assert.h>
@@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
//------------------------------------------------------------------------------
// Horrizontal filter
-static void HorizontalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
const uint8_t* preds = data;
const uint8_t* in = data;
uint8_t* out = filtered_data;
@@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
}
-static void GradientFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
//------------------------------------------------------------------------------
// Vertical filter
-static void VerticalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
extern void VP8FiltersInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
- WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
- WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
- WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
}
#else // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/filters_neon.c b/src/3rdparty/libwebp/src/dsp/filters_neon.c
index 4d6e50c..3e6a578 100644
--- a/src/3rdparty/libwebp/src/dsp/filters_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/filters_neon.c
@@ -11,12 +11,12 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
-#include "./neon.h"
+#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Helpful macros.
@@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
}
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
+ int stride, uint8_t* filtered_data) {
DoVerticalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
@@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
}
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
+ int stride, uint8_t* filtered_data) {
DoGradientFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
@@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
// GradientUnfilter_NEON is correct but slower than the C-version,
// at least on ARM64. For armv7, it's a wash.
// So best is to disable it for now, but keep the idea around...
-// #define USE_GRADIENT_UNFILTER
+#if !defined(USE_GRADIENT_UNFILTER)
+#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE
+#endif
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
#define GRAD_PROCESS_LANE(L) do { \
const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \
const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \
@@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
#undef GRAD_PROCESS_LANE
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+ uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_NEON(NULL, in, out, width);
} else {
@@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
#endif
diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
index 67f7799..5a18895 100644
--- a/src/3rdparty/libwebp/src/dsp/filters_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
@@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
@@ -24,16 +24,16 @@
// Helpful macro.
# define SANITY_CHECK(in, out) \
- assert(in != NULL); \
- assert(out != NULL); \
+ assert((in) != NULL); \
+ assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
-static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
- uint8_t* dst, int length) {
+static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
+ uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
//------------------------------------------------------------------------------
// Horizontal filter.
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows,
- uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
- PredictLineLeft(in + 1, out + 1, width - 1);
+ PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
while (row < last_row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
- PredictLineLeft(in + 1, out + 1, width - 1);
+ PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
@@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
- PredictLineLeft(in + 1, out + 1, width - 1);
+ PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
- PredictLineTop(in, in - stride, out, width);
+ PredictLineTop_SSE2(in, in - stride, out, width);
++row;
in += stride;
out += stride;
@@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Gradient filter.
-static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
-static void GradientPredictDirect(const uint8_t* const row,
- const uint8_t* const top,
- uint8_t* const out, int length) {
+static void GradientPredictDirect_SSE2(const uint8_t* const row,
+ const uint8_t* const top,
+ uint8_t* const out, int length) {
const int max_pos = length & ~7;
int i;
const __m128i zero = _mm_setzero_si128();
@@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
- out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+ out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
}
}
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
- int width, int height, int stride,
- int row, int num_rows,
- uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
- PredictLineLeft(in + 1, out + 1, width - 1);
+ PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
out[0] = in[0] - in[-stride];
- GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
+ GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
@@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
//------------------------------------------------------------------------------
-static void HorizontalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
+ filtered_data);
}
-static void VerticalFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
-static void GradientFilter(const uint8_t* data, int width, int height,
- int stride, uint8_t* filtered_data) {
- DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
//------------------------------------------------------------------------------
// Inverse transforms
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
int i;
__m128i last;
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
for (; i < width; ++i) out[i] = in[i] + out[i - 1];
}
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
if (prev == NULL) {
- HorizontalUnfilter(NULL, in, out, width);
+ HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
int i;
const int max_pos = width & ~31;
@@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
}
}
-static void GradientPredictInverse(const uint8_t* const in,
- const uint8_t* const top,
- uint8_t* const row, int length) {
+static void GradientPredictInverse_SSE2(const uint8_t* const in,
+ const uint8_t* const top,
+ uint8_t* const row, int length) {
if (length > 0) {
int i;
const int max_pos = length & ~7;
@@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
_mm_storel_epi64((__m128i*)&row[i], out);
}
for (; i < length; ++i) {
- row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+ row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
}
}
}
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
- uint8_t* out, int width) {
+static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
if (prev == NULL) {
- HorizontalUnfilter(NULL, in, out, width);
+ HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
out[0] = in[0] + prev[0]; // predict from above
- GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
+ GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
}
}
@@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
- WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
- WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
- WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
- WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
- WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
- WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index 20d18f6..83f553d 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -13,14 +13,15 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
+#include <assert.h>
#include <math.h>
#include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
#define MAX_DIFF_COST (1e30f)
@@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
-// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
+// inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
@@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
-static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
-static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
(void)top;
return left;
}
-static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[0];
}
-static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[1];
}
-static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[-1];
}
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
-GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
-static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
- int num_pixels, uint32_t* out) {
+GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
int i;
uint32_t left = out[-1];
for (i = 0; i < num_pixels; ++i) {
@@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
}
(void)upper;
}
-GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
-GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
-GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
-GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
-GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
-GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
-GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
-GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
-GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
-GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
-GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
-GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
+GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
// Inverse prediction.
-static void PredictorInverseTransform(const VP8LTransform* const transform,
- int y_start, int y_end,
- const uint32_t* in, uint32_t* out) {
+static void PredictorInverseTransform_C(const VP8LTransform* const transform,
+ int y_start, int y_end,
+ const uint32_t* in, uint32_t* out) {
const int width = transform->xsize_;
if (y_start == 0) { // First Row follows the L (mode=1) mode.
- PredictorAdd0(in, NULL, 1, out);
- PredictorAdd1(in + 1, NULL, width - 1, out + 1);
+ PredictorAdd0_C(in, NULL, 1, out);
+ PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
in += width;
out += width;
++y_start;
@@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
const uint32_t* pred_mode_src = pred_mode_base;
int x = 1;
// First pixel follows the T (mode=2) mode.
- PredictorAdd2(in, out - width, 1, out);
+ PredictorAdd2_C(in, out - width, 1, out);
// .. the rest:
while (x < width) {
const VP8LPredictorAddSubFunc pred_func =
@@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t argb = src[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
- int new_red = red;
- int new_blue = argb;
+ int new_red = red & 0xff;
+ int new_blue = argb & 0xff;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
@@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
}
// Color space inverse transform.
-static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
- int y_start, int y_end,
- const uint32_t* src, uint32_t* dst) {
+static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
+ int y_start, int y_end,
+ const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
@@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
} \
}
-COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
- VP8GetARGBIndex, VP8GetARGBValue)
-COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
- 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
+ uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
+ uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef COLOR_INDEX_INVERSE
@@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break;
case PREDICTOR_TRANSFORM:
- PredictorInverseTransform(transform, row_start, row_end, in, out);
+ PredictorInverseTransform_C(transform, row_start, row_end, in, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
// for the first row in next iteration.
@@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
}
break;
case CROSS_COLOR_TRANSFORM:
- ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
+ ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
break;
case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) {
@@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
VP8LSubSampleSize(transform->xsize_, transform->bits_);
uint32_t* const src = out + out_stride - in_stride;
memmove(src, out, in_stride * sizeof(*src));
- ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+ ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
} else {
- ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+ ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
}
break;
}
@@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = ba;
*dst++ = rg;
#else
@@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = gb;
*dst++ = rg;
#else
@@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
-
-#if !defined(WORDS_BIGENDIAN)
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
WebPUint32ToMem(dst, BSwap32(argb));
-#else // WEBP_REFERENCE_IMPLEMENTATION
- dst[0] = (argb >> 24) & 0xff;
- dst[1] = (argb >> 16) & 0xff;
- dst[2] = (argb >> 8) & 0xff;
- dst[3] = (argb >> 0) & 0xff;
-#endif
-#else // WORDS_BIGENDIAN
- dst[0] = (argb >> 0) & 0xff;
- dst[1] = (argb >> 8) & 0xff;
- dst[2] = (argb >> 16) & 0xff;
- dst[3] = (argb >> 24) & 0xff;
-#endif
dst += sizeof(argb);
}
} else {
@@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
-#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
- (OUT)[0] = IN##0; \
- (OUT)[1] = IN##1; \
- (OUT)[2] = IN##2; \
- (OUT)[3] = IN##3; \
- (OUT)[4] = IN##4; \
- (OUT)[5] = IN##5; \
- (OUT)[6] = IN##6; \
- (OUT)[7] = IN##7; \
- (OUT)[8] = IN##8; \
- (OUT)[9] = IN##9; \
- (OUT)[10] = IN##10; \
- (OUT)[11] = IN##11; \
- (OUT)[12] = IN##12; \
- (OUT)[13] = IN##13; \
- (OUT)[14] = IN##0; /* <- padding security sentinels*/ \
- (OUT)[15] = IN##0; \
+#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
+ (OUT)[0] = IN##0_C; \
+ (OUT)[1] = IN##1_C; \
+ (OUT)[2] = IN##2_C; \
+ (OUT)[3] = IN##3_C; \
+ (OUT)[4] = IN##4_C; \
+ (OUT)[5] = IN##5_C; \
+ (OUT)[6] = IN##6_C; \
+ (OUT)[7] = IN##7_C; \
+ (OUT)[8] = IN##8_C; \
+ (OUT)[9] = IN##9_C; \
+ (OUT)[10] = IN##10_C; \
+ (OUT)[11] = IN##11_C; \
+ (OUT)[12] = IN##12_C; \
+ (OUT)[13] = IN##13_C; \
+ (OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
+ (OUT)[15] = IN##0_C; \
} while (0);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
@@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
+#if !WEBP_NEON_OMIT_C_CODE
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
- VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+ VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+ VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+#endif
+
VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
- VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
- VP8LMapColor32b = MapARGB;
- VP8LMapColor8b = MapAlpha;
+ VP8LMapColor32b = MapARGB_C;
+ VP8LMapColor8b = MapAlpha_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
VP8LDspInitSSE2();
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- VP8LDspInitNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8LDspInitMIPSdspR2();
@@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8LDspInitNEON();
+ }
+#endif
+
+ assert(VP8LAddGreenToBlueAndRed != NULL);
+ assert(VP8LTransformColorInverse != NULL);
+ assert(VP8LConvertBGRAToRGBA != NULL);
+ assert(VP8LConvertBGRAToRGB != NULL);
+ assert(VP8LConvertBGRAToBGR != NULL);
+ assert(VP8LConvertBGRAToRGBA4444 != NULL);
+ assert(VP8LConvertBGRAToRGB565 != NULL);
+ assert(VP8LMapColor32b != NULL);
+ assert(VP8LMapColor8b != NULL);
+
lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
#undef COPY_PREDICTOR_ARRAY
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index 352a54e..a99dbda 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -15,18 +15,18 @@
#ifndef WEBP_DSP_LOSSLESS_H_
#define WEBP_DSP_LOSSLESS_H_
-#include "../webp/types.h"
-#include "../webp/decode.h"
+#include "src/webp/types.h"
+#include "src/webp/decode.h"
-#include "../enc/histogram_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/histogram_enc.h"
+#include "src/utils/utils.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../enc/delta_palettization_enc.h"
+#include "src/enc/delta_palettization_enc.h"
#endif // WEBP_EXPERIMENTAL_FEATURES
//------------------------------------------------------------------------------
@@ -124,7 +124,7 @@ void VP8LDspInit(void);
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
- uint32_t* const dst, int num_pixels);
+ uint32_t* dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_common.h b/src/3rdparty/libwebp/src/dsp/lossless_common.h
index c40f711..a2648d1 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_common.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless_common.h
@@ -16,9 +16,9 @@
#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
#define WEBP_DSP_LOSSLESS_COMMON_H_
-#include "../webp/types.h"
+#include "src/webp/types.h"
-#include "../utils/utils.h"
+#include "src/utils/utils.h"
#ifdef __cplusplus
extern "C" {
@@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
// -----------------------------------------------------------------------------
// PrefixEncode()
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
- const int log_floor = BitsLog2Floor(n);
- if (n == (n & ~(n - 1))) { // zero or a power of two.
- return log_floor;
- }
- return log_floor + 1;
-}
-
// Splitting of distance and length codes into prefixes and
// extra bits. The prefixes are encoded with an entropy code
// while the extra bits are stored just as normal bits.
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
index 4e46fba..92ca3c0 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -13,15 +13,16 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
+#include <assert.h>
#include <math.h>
#include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
-#include "./yuv.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
@@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
-static float FastSLog2Slow(uint32_t v) {
+static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
@@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) {
}
}
-static float FastLog2Slow(uint32_t v) {
+static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
@@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) {
// Methods to calculate Entropy (Shannon).
// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
@@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
-static void GetEntropyUnrefined(const uint32_t X[], int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
-static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
- int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+ const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
@@ -520,8 +522,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
const uint32_t argb = data[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
- int new_red = red;
- int new_blue = argb;
+ int new_red = red & 0xff;
+ int new_blue = argb & 0xff;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@@ -577,8 +579,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
//------------------------------------------------------------------------------
-static int VectorMismatch(const uint32_t* const array1,
- const uint32_t* const array2, int length) {
+static int VectorMismatch_C(const uint32_t* const array1,
+ const uint32_t* const array2, int length) {
int match_len = 0;
while (match_len < length && array1[match_len] == array2[match_len]) {
@@ -610,15 +612,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
//------------------------------------------------------------------------------
-static double ExtraCost(const uint32_t* population, int length) {
+static double ExtraCost_C(const uint32_t* population, int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
return cost;
}
-static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
- int length) {
+static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+ int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) {
@@ -630,9 +632,9 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
//------------------------------------------------------------------------------
-static void HistogramAdd(const VP8LHistogram* const a,
- const VP8LHistogram* const b,
- VP8LHistogram* const out) {
+static void HistogramAdd_C(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
@@ -869,26 +871,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
VP8LDspInit();
+#if !WEBP_NEON_OMIT_C_CODE
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LTransformColor = VP8LTransformColor_C;
+#endif
VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
- VP8LFastLog2Slow = FastLog2Slow;
- VP8LFastSLog2Slow = FastSLog2Slow;
+ VP8LFastLog2Slow = FastLog2Slow_C;
+ VP8LFastSLog2Slow = FastSLog2Slow_C;
- VP8LExtraCost = ExtraCost;
- VP8LExtraCostCombined = ExtraCostCombined;
- VP8LCombinedShannonEntropy = CombinedShannonEntropy;
+ VP8LExtraCost = ExtraCost_C;
+ VP8LExtraCostCombined = ExtraCostCombined_C;
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
- VP8LGetEntropyUnrefined = GetEntropyUnrefined;
- VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
- VP8LHistogramAdd = HistogramAdd;
+ VP8LHistogramAdd = HistogramAdd_C;
- VP8LVectorMismatch = VectorMismatch;
+ VP8LVectorMismatch = VectorMismatch_C;
VP8LBundleColorMap = VP8LBundleColorMap_C;
VP8LPredictorsSub[0] = PredictorSub0_C;
@@ -937,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
#endif
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- VP8LEncDspInitNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8LEncDspInitMIPS32();
@@ -958,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8LEncDspInitNEON();
+ }
+#endif
+
+ assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+ assert(VP8LTransformColor != NULL);
+ assert(VP8LCollectColorBlueTransforms != NULL);
+ assert(VP8LCollectColorRedTransforms != NULL);
+ assert(VP8LFastLog2Slow != NULL);
+ assert(VP8LFastSLog2Slow != NULL);
+ assert(VP8LExtraCost != NULL);
+ assert(VP8LExtraCostCombined != NULL);
+ assert(VP8LCombinedShannonEntropy != NULL);
+ assert(VP8LGetEntropyUnrefined != NULL);
+ assert(VP8LGetCombinedEntropyUnrefined != NULL);
+ assert(VP8LHistogramAdd != NULL);
+ assert(VP8LVectorMismatch != NULL);
+ assert(VP8LBundleColorMap != NULL);
+ assert(VP8LPredictorsSub[0] != NULL);
+ assert(VP8LPredictorsSub[1] != NULL);
+ assert(VP8LPredictorsSub[2] != NULL);
+ assert(VP8LPredictorsSub[3] != NULL);
+ assert(VP8LPredictorsSub[4] != NULL);
+ assert(VP8LPredictorsSub[5] != NULL);
+ assert(VP8LPredictorsSub[6] != NULL);
+ assert(VP8LPredictorsSub[7] != NULL);
+ assert(VP8LPredictorsSub[8] != NULL);
+ assert(VP8LPredictorsSub[9] != NULL);
+ assert(VP8LPredictorsSub[10] != NULL);
+ assert(VP8LPredictorsSub[11] != NULL);
+ assert(VP8LPredictorsSub[12] != NULL);
+ assert(VP8LPredictorsSub[13] != NULL);
+ assert(VP8LPredictorsSub[14] != NULL);
+ assert(VP8LPredictorsSub[15] != NULL);
+ assert(VP8LPredictorsSub_C[0] != NULL);
+ assert(VP8LPredictorsSub_C[1] != NULL);
+ assert(VP8LPredictorsSub_C[2] != NULL);
+ assert(VP8LPredictorsSub_C[3] != NULL);
+ assert(VP8LPredictorsSub_C[4] != NULL);
+ assert(VP8LPredictorsSub_C[5] != NULL);
+ assert(VP8LPredictorsSub_C[6] != NULL);
+ assert(VP8LPredictorsSub_C[7] != NULL);
+ assert(VP8LPredictorsSub_C[8] != NULL);
+ assert(VP8LPredictorsSub_C[9] != NULL);
+ assert(VP8LPredictorsSub_C[10] != NULL);
+ assert(VP8LPredictorsSub_C[11] != NULL);
+ assert(VP8LPredictorsSub_C[12] != NULL);
+ assert(VP8LPredictorsSub_C[13] != NULL);
+ assert(VP8LPredictorsSub_C[14] != NULL);
+ assert(VP8LPredictorsSub_C[15] != NULL);
+
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 4186b9f..e7b58f4 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -12,9 +12,9 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
-#include "./dsp.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
#if defined(WEBP_USE_MIPS32)
@@ -23,7 +23,7 @@
#include <stdlib.h>
#include <string.h>
-static float FastSLog2Slow(uint32_t v) {
+static float FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
@@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) {
}
}
-static float FastLog2Slow(uint32_t v) {
+static float FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
@@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) {
// pop += 2;
// }
// return (double)cost;
-static double ExtraCost(const uint32_t* const population, int length) {
+static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
@@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) {
// pY += 2;
// }
// return (double)cost;
-static double ExtraCostCombined(const uint32_t* const X,
- const uint32_t* const Y, int length) {
+static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+ const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
@@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
-static void GetEntropyUnrefined(const uint32_t X[], int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
-static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
- int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+ const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const entropy,
+ VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
- VP8LBitEntropyInit(bit_entropy);
+ VP8LBitEntropyInit(entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
- GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
}
}
- GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
- bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+ entropy->entropy += VP8LFastSLog2(entropy->sum);
}
#define ASM_START \
@@ -374,9 +375,9 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
} \
} while (0)
-static void HistogramAdd(const VP8LHistogram* const a,
- const VP8LHistogram* const b,
- VP8LHistogram* const out) {
+static void HistogramAdd_MIPS32(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
- (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
@@ -415,13 +416,13 @@ static void HistogramAdd(const VP8LHistogram* const a,
extern void VP8LEncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
- VP8LFastSLog2Slow = FastSLog2Slow;
- VP8LFastLog2Slow = FastLog2Slow;
- VP8LExtraCost = ExtraCost;
- VP8LExtraCostCombined = ExtraCostCombined;
- VP8LGetEntropyUnrefined = GetEntropyUnrefined;
- VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
- VP8LHistogramAdd = HistogramAdd;
+ VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
+ VP8LFastLog2Slow = FastLog2Slow_MIPS32;
+ VP8LExtraCost = ExtraCost_MIPS32;
+ VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
+ VP8LHistogramAdd = HistogramAdd_MIPS32;
}
#else // !WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
index 0abf3c4..5855e6a 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
@@ -12,14 +12,14 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "./lossless.h"
+#include "src/dsp/lossless.h"
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
- int num_pixels) {
+static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
+ int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
@@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
return (uint32_t)((int)(color_pred) * color) >> 5;
}
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
- int num_pixels) {
+static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
@@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
- int tile_width, int tile_height,
- int green_to_blue, int red_to_blue,
- int histo[]) {
+static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
+ int stride,
+ int tile_width,
+ int tile_height,
+ int green_to_blue,
+ int red_to_blue,
+ int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
@@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
return (new_red & 0xff);
}
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
- int tile_width, int tile_height,
- int green_to_red, int histo[]) {
+static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+ int stride,
+ int tile_width,
+ int tile_height,
+ int green_to_red,
+ int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;
@@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
extern void VP8LEncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
- VP8LTransformColor = TransformColor;
- VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
- VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
+ VP8LTransformColor = TransformColor_MIPSdspR2;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
index 2f69ba3..600dddf 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
@@ -11,12 +11,12 @@
//
// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
-#include "./lossless.h"
-#include "./msa_macro.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/msa_macro.h"
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
v8i16 g0, g1, t0, t1, t2, t3; \
@@ -48,8 +48,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
- int num_pixels) {
+static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
}
}
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
+ int num_pixels) {
int i;
uint8_t* ptemp_data = (uint8_t*)argb_data;
v16u8 src0, dst0, tmp0;
@@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
extern void VP8LEncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
- VP8LTransformColor = TransformColor;
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
+ VP8LTransformColor = TransformColor_MSA;
}
#else // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
index 4c56f25..7c7b73f 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
@@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
-#include "./lossless.h"
-#include "./neon.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
@@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
- const uint8x16_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
@@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
- const uint8x8_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
+ int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
- const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+ const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
@@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
-static void TransformColor(const VP8LMultipliers* const m,
- uint32_t* argb_data, int num_pixels) {
+static void TransformColor_NEON(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
@@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m,
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
// 0 g 0 g
- const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+ const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// r 0 b 0
@@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m,
extern void VP8LEncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
- VP8LTransformColor = TransformColor;
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
+ VP8LTransformColor = TransformColor_NEON;
}
#else // !WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 8ad85d9..1eaf35c 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -11,22 +11,23 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
-#include "./lossless.h"
-#include "./common_sse2.h"
-#include "./lossless_common.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dsp/lossless_common.h"
// For sign-extended multiplying constants, pre-shifted by 5:
-#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
+ int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@@ -45,8 +46,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
-static void TransformColor(const VP8LMultipliers* const m,
- uint32_t* argb_data, int num_pixels) {
+static void TransformColor_SSE2(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = _mm_set_epi16(
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
@@ -80,10 +81,10 @@ static void TransformColor(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
#define SPAN 8
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
- int tile_width, int tile_height,
- int green_to_blue, int red_to_blue,
- int histo[]) {
+static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
const __m128i mults_r = _mm_set_epi16(
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
@@ -131,9 +132,9 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
}
}
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
- int tile_width, int tile_height,
- int green_to_red, int histo[]) {
+static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
@@ -177,8 +178,8 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
//------------------------------------------------------------------------------
#define LINE_SIZE 16 // 8 or 16
-static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
- int size) {
+static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
@@ -203,7 +204,7 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
}
}
-static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
@@ -231,22 +232,22 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
-static void HistogramAdd(const VP8LHistogram* const a,
- const VP8LHistogram* const b,
- VP8LHistogram* const out) {
+static void HistogramAdd_SSE2(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
- AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
- AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
- AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
- AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+ AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+ AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+ AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+ AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
} else {
- AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
- AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
- AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
- AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+ AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES);
+ AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES);
+ AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES);
+ AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
}
for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
out->literal_[i] = a->literal_[i] + b->literal_[i];
@@ -261,9 +262,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
// Checks whether the X or Y contribution is worth computing and adding.
// Used in loop unrolling.
-#define ANALYZE_X_OR_Y(x_or_y, j) \
- do { \
- if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
+#define ANALYZE_X_OR_Y(x_or_y, j) \
+ do { \
+ if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
} while (0)
// Checks whether the X + Y contribution is worth computing and adding.
@@ -276,7 +277,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
} \
} while (0)
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX, sumXY;
@@ -332,8 +333,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
//------------------------------------------------------------------------------
-static int VectorMismatch(const uint32_t* const array1,
- const uint32_t* const array2, int length) {
+static int VectorMismatch_SSE2(const uint32_t* const array1,
+ const uint32_t* const array2, int length) {
int match_len;
if (length >= 12) {
@@ -574,8 +575,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
}
// Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
- __m128i* const out) {
+static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
+ __m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
// on both operands (to that their sum of abs diff is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
@@ -596,8 +597,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa, pb;
- GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
- GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|
+ GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
+ GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
{
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
@@ -677,13 +678,13 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
extern void VP8LEncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
- VP8LTransformColor = TransformColor;
- VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
- VP8LCollectColorRedTransforms = CollectColorRedTransforms;
- VP8LHistogramAdd = HistogramAdd;
- VP8LCombinedShannonEntropy = CombinedShannonEntropy;
- VP8LVectorMismatch = VectorMismatch;
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
+ VP8LTransformColor = TransformColor_SSE2;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
+ VP8LHistogramAdd = HistogramAdd_SSE2;
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+ VP8LVectorMismatch = VectorMismatch_SSE2;
VP8LBundleColorMap = BundleColorMap_SSE2;
VP8LPredictorsSub[0] = PredictorSub0_SSE2;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 821057c..3526a34 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -11,17 +11,18 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
-#include "./lossless.h"
+#include "src/dsp/lossless.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
+ int num_pixels) {
int i;
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
-1, 5, -1, 5, -1, 1, -1, 1);
@@ -43,7 +44,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
}
#else // !WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 2984ce8..9888854 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -12,12 +12,12 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
static void FUNC_NAME(const TYPE* src, \
@@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src, \
} \
}
-MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
-MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef MAP_COLOR_FUNCS
@@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
return Average2(Average2(a0, a1), Average2(a2, a3));
}
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average3(left, top[0], top[1]);
}
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[-1]);
}
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[0]);
}
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[-1], top[0]);
}
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[0], top[1]);
}
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+ const uint32_t* const top) {
return Average4(left, top[-1], top[0], top[1]);
}
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+ const uint32_t* const top) {
return Select(top[0], left, top[-1]);
}
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+ const uint32_t* const top) {
return ClampedAddSubtractFull(left, top[0], top[-1]);
}
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+ const uint32_t* const top) {
return ClampedAddSubtractHalf(left, top[0], top[-1]);
}
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
-static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
- uint32_t* dst) {
+static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
);
}
-static void TransformColorInverse(const VP8LMultipliers* const m,
- const uint32_t* src, int num_pixels,
- uint32_t* dst) {
+static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
@@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
}
-static void ConvertBGRAToRGB(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
);
}
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
);
}
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
"ins %[temp3], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 16 \n\t"
"precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#else
@@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
"ins %[temp0], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp0], 0(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
@@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
);
}
-static void ConvertBGRAToRGB565(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
"ins %[temp2], %[temp3], 0, 5 \n\t"
"addiu %[src], %[src], 16 \n\t"
"append %[temp2], %[temp1], 16 \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#else
@@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
"ins %[temp4], %[temp5], 0, 11 \n\t"
"addiu %[src], %[src], 4 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp4], 0(%[dst]) \n\t"
#else
"wsbh %[temp4], %[temp4] \n\t"
@@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
);
}
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src,
extern void VP8LDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
- VP8LMapColor32b = MapARGB;
- VP8LMapColor8b = MapAlpha;
- VP8LPredictors[5] = Predictor5;
- VP8LPredictors[6] = Predictor6;
- VP8LPredictors[7] = Predictor7;
- VP8LPredictors[8] = Predictor8;
- VP8LPredictors[9] = Predictor9;
- VP8LPredictors[10] = Predictor10;
- VP8LPredictors[11] = Predictor11;
- VP8LPredictors[12] = Predictor12;
- VP8LPredictors[13] = Predictor13;
- VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
- VP8LTransformColorInverse = TransformColorInverse;
- VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
- VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
- VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
- VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
- VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+ VP8LMapColor32b = MapARGB_MIPSdspR2;
+ VP8LMapColor8b = MapAlpha_MIPSdspR2;
+
+ VP8LPredictors[5] = Predictor5_MIPSdspR2;
+ VP8LPredictors[6] = Predictor6_MIPSdspR2;
+ VP8LPredictors[7] = Predictor7_MIPSdspR2;
+ VP8LPredictors[8] = Predictor8_MIPSdspR2;
+ VP8LPredictors[9] = Predictor9_MIPSdspR2;
+ VP8LPredictors[10] = Predictor10_MIPSdspR2;
+ VP8LPredictors[11] = Predictor11_MIPSdspR2;
+ VP8LPredictors[12] = Predictor12_MIPSdspR2;
+ VP8LPredictors[13] = Predictor13_MIPSdspR2;
+
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
+ VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
+
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
+ VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
+ VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_msa.c
index f6dd564..9f54720 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_msa.c
@@ -11,12 +11,12 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
-#include "./lossless.h"
-#include "./msa_macro.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/msa_macro.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
@@ -43,7 +43,7 @@
#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
uint64_t pix_d; \
- v16u8 src0, src1, src2, dst0, dst1; \
+ v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
LD_UB2(psrc, 16, src0, src1); \
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
ST_UB(dst0, pdst); \
@@ -109,8 +109,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
int i;
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
@@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
@@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
}
}
-static void ConvertBGRAToRGB(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
@@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
}
}
-static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
- uint32_t* dst) {
+static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
int i;
const uint8_t* in = (const uint8_t*)src;
uint8_t* out = (uint8_t*)dst;
@@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
}
}
-static void TransformColorInverse(const VP8LMultipliers* const m,
- const uint32_t* src, int num_pixels,
- uint32_t* dst) {
+static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
extern void VP8LDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
- VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
- VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
- VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
- VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
- VP8LTransformColorInverse = TransformColorInverse;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
+
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
+ VP8LTransformColorInverse = TransformColorInverse_MSA;
}
#else // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
index 1145d5f..76a1b6f 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
-#include "./lossless.h"
-#include "./neon.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
@@ -26,8 +26,8 @@
#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
}
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
}
-static void ConvertBGRAToRGB(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~1);
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
for (; src < end; src += 2) {
@@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
-static void ConvertBGRAToRGB(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
#endif // !WORK_AROUND_GCC
-
//------------------------------------------------------------------------------
// Predictor Transform
@@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
- const uint8x16_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
@@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
- const uint8x8_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
-static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
- uint32_t* dst) {
+static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
#endif
for (; src < end; src += 4, dst += 4) {
const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
- const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+ const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
@@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
//------------------------------------------------------------------------------
// Color Transform
-static void TransformColorInverse(const VP8LMultipliers* const m,
- const uint32_t* const src, int num_pixels,
- uint32_t* dst) {
+static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
+ const uint32_t* const src,
+ int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
@@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
- const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+ const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// x r' x b'
@@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
- VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
- VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
- VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
- VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
- VP8LTransformColorInverse = TransformColorInverse;
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
+ VP8LTransformColorInverse = TransformColorInverse_NEON;
}
#else // !WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index 15aae93..653b466 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -11,21 +11,22 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
-#include "./common_sse2.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
// Predictor Transform
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
- uint32_t c2) {
+static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
+ uint32_t c1,
+ uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
return output;
}
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
- uint32_t c2) {
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
+ uint32_t c1,
+ uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return output;
}
-static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
int pa_minus_pb;
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_cvtsi32_si128(a);
@@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
*avg = _mm_sub_epi8(avg1, one);
}
-static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
- __m128i* const avg) {
+static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
+ const uint32_t a1,
+ __m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
const __m128i ones = _mm_set1_epi8(1);
const __m128i A0 = _mm_cvtsi32_si128(a0);
@@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
*avg = _mm_sub_epi8(avg1, one);
}
-static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
@@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
return _mm_srli_epi16(sum, 1);
}
-static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
__m128i output;
- Average2_uint32(a0, a1, &output);
+ Average2_uint32_SSE2(a0, a1, &output);
return _mm_cvtsi128_si32(output);
}
-static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
+ uint32_t a2) {
const __m128i zero = _mm_setzero_si128();
- const __m128i avg1 = Average2_uint32_16(a0, a2);
+ const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(avg1, A1);
const __m128i avg2 = _mm_srli_epi16(sum, 1);
@@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return output;
}
-static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
- uint32_t a2, uint32_t a3) {
- const __m128i avg1 = Average2_uint32_16(a0, a1);
- const __m128i avg2 = Average2_uint32_16(a2, a3);
+static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
+ uint32_t a2, uint32_t a3) {
+ const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
+ const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
const __m128i sum = _mm_add_epi16(avg2, avg1);
const __m128i avg3 = _mm_srli_epi16(sum, 1);
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
}
static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average3(left, top[0], top[1]);
+ const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(left, top[-1]);
+ const uint32_t pred = Average2_SSE2(left, top[-1]);
return pred;
}
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(left, top[0]);
+ const uint32_t pred = Average2_SSE2(left, top[0]);
return pred;
}
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(top[-1], top[0]);
+ const uint32_t pred = Average2_SSE2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(top[0], top[1]);
+ const uint32_t pred = Average2_SSE2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+ const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Select(top[0], left, top[-1]);
+ const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+ const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+ const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
return pred;
}
@@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2
// Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(OUT) do { \
+ __m128i avgLTL, avg; \
+ Average2_m128i(&L, &TL, &avgLTL); \
+ Average2_m128i(&avgTTR, &avgLTL, &avg); \
+ L = _mm_add_epi8(avg, src); \
+ out[i + (OUT)] = _mm_cvtsi128_si32(L); \
+} while (0)
+
+#define DO_PRED10_SHIFT do { \
+ /* Rotate the pre-computed values for the next iteration.*/ \
+ avgTTR = _mm_srli_si128(avgTTR, 4); \
+ TL = _mm_srli_si128(TL, 4); \
+ src = _mm_srli_si128(src, 4); \
+} while (0)
+
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
- int i, j;
+ int i;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR;
Average2_m128i(&T, &TR, &avgTTR);
- for (j = 0; j < 4; ++j) {
- __m128i avgLTL, avg;
- Average2_m128i(&L, &TL, &avgLTL);
- Average2_m128i(&avgTTR, &avgLTL, &avg);
- L = _mm_add_epi8(avg, src);
- out[i + j] = _mm_cvtsi128_si32(L);
- // Rotate the pre-computed values for the next iteration.
- avgTTR = _mm_srli_si128(avgTTR, 4);
- TL = _mm_srli_si128(TL, 4);
- src = _mm_srli_si128(src, 4);
- }
+ DO_PRED10(0);
+ DO_PRED10_SHIFT;
+ DO_PRED10(1);
+ DO_PRED10_SHIFT;
+ DO_PRED10(2);
+ DO_PRED10_SHIFT;
+ DO_PRED10(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
+#undef DO_PRED10
+#undef DO_PRED10_SHIFT
// Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
- __m128i* const out) {
- // We can unpack with any value on the upper 32 bits, provided it's the same
- // on both operands (to that their sum of abs diff is zero). Here we use *A.
- const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
- const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
- const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
- const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
- const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
- const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
- *out = _mm_packs_epi32(s_lo, s_hi);
-}
+#define DO_PRED11(OUT) do { \
+ const __m128i L_lo = _mm_unpacklo_epi32(L, T); \
+ const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \
+ const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \
+ const __m128i mask = _mm_cmpgt_epi32(pb, pa); \
+ const __m128i A = _mm_and_si128(mask, L); \
+ const __m128i B = _mm_andnot_si128(mask, T); \
+ const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
+ L = _mm_add_epi8(src, pred); \
+ out[i + (OUT)] = _mm_cvtsi128_si32(L); \
+} while (0)
+
+#define DO_PRED11_SHIFT do { \
+ /* Shift the pre-computed value for the next iteration.*/ \
+ T = _mm_srli_si128(T, 4); \
+ TL = _mm_srli_si128(TL, 4); \
+ src = _mm_srli_si128(src, 4); \
+ pa = _mm_srli_si128(pa, 4); \
+} while (0)
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
- int i, j;
+ int i;
+ __m128i pa;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
- __m128i pa;
- GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
- for (j = 0; j < 4; ++j) {
- const __m128i L_lo = _mm_unpacklo_epi32(L, L);
- const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
- const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL|
- const __m128i mask = _mm_cmpgt_epi32(pb, pa);
- const __m128i A = _mm_and_si128(mask, L);
- const __m128i B = _mm_andnot_si128(mask, T);
- const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T
- L = _mm_add_epi8(src, pred);
- out[i + j] = _mm_cvtsi128_si32(L);
- // Shift the pre-computed value for the next iteration.
- T = _mm_srli_si128(T, 4);
- TL = _mm_srli_si128(TL, 4);
- src = _mm_srli_si128(src, 4);
- pa = _mm_srli_si128(pa, 4);
+ {
+ // We can unpack with any value on the upper 32 bits, provided it's the
+ // same on both operands (so that their sum of abs diff is zero). Here we
+ // use T.
+ const __m128i T_lo = _mm_unpacklo_epi32(T, T);
+ const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
+ const __m128i T_hi = _mm_unpackhi_epi32(T, T);
+ const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
+ const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
+ const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
+ pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|
}
+ DO_PRED11(0);
+ DO_PRED11_SHIFT;
+ DO_PRED11(1);
+ DO_PRED11_SHIFT;
+ DO_PRED11(2);
+ DO_PRED11_SHIFT;
+ DO_PRED11(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
}
+#undef DO_PRED11
+#undef DO_PRED11_SHIFT
// Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT) \
-do { \
- const __m128i all = _mm_add_epi16(L, (DIFF)); \
- const __m128i alls = _mm_packus_epi16(all, all); \
- const __m128i res = _mm_add_epi8(src, alls); \
- out[i + (OUT)] = _mm_cvtsi128_si32(res); \
- L = _mm_unpacklo_epi8(res, zero); \
+#define DO_PRED12(DIFF, LANE, OUT) do { \
+ const __m128i all = _mm_add_epi16(L, (DIFF)); \
+ const __m128i alls = _mm_packus_epi16(all, all); \
+ const __m128i res = _mm_add_epi8(src, alls); \
+ out[i + (OUT)] = _mm_cvtsi128_si32(res); \
+ L = _mm_unpacklo_epi8(res, zero); \
+} while (0)
+
+#define DO_PRED12_SHIFT(DIFF, LANE) do { \
/* Shift the pre-computed value for the next iteration.*/ \
- if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
+ if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
src = _mm_srli_si128(src, 4); \
} while (0)
@@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
__m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
DO_PRED12(diff_lo, 0, 0);
+ DO_PRED12_SHIFT(diff_lo, 0);
DO_PRED12(diff_lo, 1, 1);
+ DO_PRED12_SHIFT(diff_lo, 1);
DO_PRED12(diff_hi, 0, 2);
+ DO_PRED12_SHIFT(diff_hi, 0);
DO_PRED12(diff_hi, 1, 3);
}
if (i != num_pixels) {
@@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
}
}
#undef DO_PRED12
+#undef DO_PRED12_SHIFT
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 13.
@@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
//------------------------------------------------------------------------------
// Subtract-Green Transform
-static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
- uint32_t* dst) {
+static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@@ -414,9 +448,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
//------------------------------------------------------------------------------
// Color Transform
-static void TransformColorInverse(const VP8LMultipliers* const m,
- const uint32_t* const src, int num_pixels,
- uint32_t* dst) {
+static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
+ const uint32_t* const src,
+ int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
@@ -454,8 +488,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
// Color-space conversion functions
-static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
- uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
+ uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
@@ -490,27 +524,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
}
}
-static void ConvertBGRAToRGBA(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
- const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
- const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
- const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
- const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
- const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
- const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
- const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
- const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
- const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
- const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
- const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
- const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
- const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
- const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
- _mm_storeu_si128(out++, rgba0);
- _mm_storeu_si128(out++, rgba4);
+ const __m128i A1 = _mm_loadu_si128(in++);
+ const __m128i A2 = _mm_loadu_si128(in++);
+ const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0
+ const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0
+ const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A
+ const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A
+ const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i F1 = _mm_or_si128(E1, C1);
+ const __m128i F2 = _mm_or_si128(E2, C2);
+ _mm_storeu_si128(out++, F1);
+ _mm_storeu_si128(out++, F2);
num_pixels -= 8;
}
// left-overs
@@ -519,8 +552,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
const __m128i* in = (const __m128i*)src;
@@ -541,7 +574,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
#else
const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
@@ -555,8 +588,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
}
}
-static void ConvertBGRAToRGB565(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@@ -582,7 +615,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
const __m128i b1 = _mm_srli_epi16(b0, 3);
const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
#else
const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
@@ -596,8 +629,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
}
}
-static void ConvertBGRAToBGR(const uint32_t* src,
- int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
const __m128i* in = (const __m128i*)src;
@@ -660,14 +693,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
- VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
- VP8LTransformColorInverse = TransformColorInverse;
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
+ VP8LTransformColorInverse = TransformColorInverse_SSE2;
- VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
- VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
- VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
- VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
- VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
+ VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
+ VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h
index d0e5f45..dfacda6 100644
--- a/src/3rdparty/libwebp/src/dsp/msa_macro.h
+++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h
@@ -22,6 +22,7 @@
#endif
#ifdef CLANG_BUILD
+ #define ALPHAVAL (-1)
#define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
#define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)
#define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
@@ -32,6 +33,7 @@
#define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)
#define ORI_B(a, b) __msa_ori_b((v16u8)a, b)
#else
+ #define ALPHAVAL (0xff)
#define ADDVI_H(a, b) (a + b)
#define ADDVI_W(a, b) (a + b)
#define SRAI_B(a, b) (a >> b)
diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h
index 3b548a6..aa1dea1 100644
--- a/src/3rdparty/libwebp/src/dsp/neon.h
+++ b/src/3rdparty/libwebp/src/dsp/neon.h
@@ -14,11 +14,12 @@
#include <arm_neon.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
// Right now, some intrinsics functions seem slower, so we disable them
-// everywhere except aarch64 where the inline assembly is incompatible.
-#if defined(__aarch64__)
+// everywhere except newer clang/gcc or aarch64 where the inline assembly is
+// incompatible.
+#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
#define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif
@@ -43,11 +44,11 @@
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WORK_AROUND_GCC
#endif
-static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
+static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
index 0f54502..4b6b783 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -13,8 +13,8 @@
#include <assert.h>
-#include "./dsp.h"
-#include "../utils/rescaler_utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@@ -25,7 +25,8 @@
//------------------------------------------------------------------------------
// Row import
-void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
+ const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
}
}
-void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
+ const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
//------------------------------------------------------------------------------
// Row export
-void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
+void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
}
}
-void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
+void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@@ -207,11 +209,14 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
+#if !defined(WEBP_REDUCE_SIZE)
+#if !WEBP_NEON_OMIT_C_CODE
+ WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
+ WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
+#endif
- WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
- WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
- WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
- WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
+ WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
+ WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@@ -219,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
WebPRescalerDspInitSSE2();
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- WebPRescalerDspInitNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPRescalerDspInitMIPS32();
@@ -240,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ WebPRescalerDspInitNEON();
+ }
+#endif
+
+ assert(WebPRescalerExportRowExpand != NULL);
+ assert(WebPRescalerExportRowShrink != NULL);
+ assert(WebPRescalerImportRowExpand != NULL);
+ assert(WebPRescalerImportRowShrink != NULL);
+#endif // WEBP_REDUCE_SIZE
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
index e09ad5d..542f7e5 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
@@ -11,17 +11,18 @@
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
-#if defined(WEBP_USE_MIPS32)
+#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
-#include "../utils/rescaler_utils.h"
+#include "src/utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Row import
-static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
+static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+ const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int fx_scale = wrk->fx_scale;
@@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
}
}
-static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
+static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+ const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
//------------------------------------------------------------------------------
// Row export
-static void ExportRowExpand(WebPRescaler* const wrk) {
+static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -207,7 +209,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
}
}
-static void ExportRowShrink(WebPRescaler* const wrk) {
+static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
@@ -278,10 +280,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
- WebPRescalerImportRowExpand = ImportRowExpand;
- WebPRescalerImportRowShrink = ImportRowShrink;
- WebPRescalerExportRowExpand = ExportRowExpand;
- WebPRescalerExportRowShrink = ExportRowShrink;
+ WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
+ WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
+ WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
+ WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
}
#else // !WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
index 2308d64..b78aac1 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
@@ -11,12 +11,12 @@
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
-#if defined(WEBP_USE_MIPS_DSP_R2)
+#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
-#include "../utils/rescaler_utils.h"
+#include "src/utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -24,7 +24,7 @@
//------------------------------------------------------------------------------
// Row export
-static void ExportRowShrink(WebPRescaler* const wrk) {
+static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
int i;
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
@@ -162,7 +162,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
}
}
-static void ExportRowExpand(WebPRescaler* const wrk) {
+static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
int i;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
@@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
- WebPRescalerExportRowExpand = ExportRowExpand;
- WebPRescalerExportRowShrink = ExportRowShrink;
+ WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
+ WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
index 2c10e55..f3bc99f 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
@@ -11,14 +11,14 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
-#if defined(WEBP_USE_MSA)
+#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
-#include "../utils/rescaler_utils.h"
-#include "./msa_macro.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/dsp/msa_macro.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -246,7 +246,7 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
}
}
-static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -411,7 +411,7 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
}
}
-static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -433,8 +433,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
- WebPRescalerExportRowExpand = RescalerExportRowExpand;
- WebPRescalerExportRowShrink = RescalerExportRowShrink;
+ WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
+ WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
index b2dd8f3..3eff9fb 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
@@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
#include <arm_neon.h>
#include <assert.h>
-#include "./neon.h"
-#include "../utils/rescaler_utils.h"
+#include "src/dsp/neon.h"
+#include "src/utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -41,9 +41,9 @@
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
#endif
-static uint32x4_t Interpolate(const rescaler_t* const frow,
- const rescaler_t* const irow,
- uint32_t A, uint32_t B) {
+static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
+ const rescaler_t* const irow,
+ uint32_t A, uint32_t B) {
LOAD_32x4(frow, A0);
LOAD_32x4(irow, B0);
const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
@@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow,
return E;
}
-static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < max_span; x_out += 8) {
const uint32x4_t C0 =
- Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
+ Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
const uint32x4_t C1 =
- Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
+ Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
@@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
}
}
-static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
- WebPRescalerExportRowExpand = RescalerExportRowExpand;
- WebPRescalerExportRowShrink = RescalerExportRowShrink;
+ WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
+ WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
}
#else // !WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
index 8271c22..f93b204 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
#include <emmintrin.h>
#include <assert.h>
-#include "../utils/rescaler_utils.h"
-#include "../utils/utils.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@@ -27,7 +27,7 @@
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
-static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
+static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
@@ -36,14 +36,14 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
-static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
+static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
}
-static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
- const uint8_t* src) {
+static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
+ const uint8_t* src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@@ -54,10 +54,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
- WebPRescalerImportRowExpandC(wrk, src);
+ WebPRescalerImportRowExpand_C(wrk, src);
return;
}
- LoadTwoPixels(src, &cur_pixels);
+ LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
while (1) {
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
@@ -67,7 +67,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
- LoadTwoPixels(src, &cur_pixels);
+ LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
accum += x_add;
}
@@ -76,10 +76,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
- WebPRescalerImportRowExpandC(wrk, src);
+ WebPRescalerImportRowExpand_C(wrk, src);
return;
}
- LoadHeightPixels(src, &cur_pixels);
+ LoadHeightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
@@ -94,7 +94,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
- LoadHeightPixels(src, &cur_pixels);
+ LoadHeightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail
@@ -110,8 +110,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
assert(accum == 0);
}
-static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
- const uint8_t* src) {
+static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
+ const uint8_t* src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
@@ -123,7 +123,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
- WebPRescalerImportRowShrinkC(wrk, src);
+ WebPRescalerImportRowShrink_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
@@ -169,12 +169,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
// Row export
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
-static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
- const __m128i* const mult,
- __m128i* const out0,
- __m128i* const out1,
- __m128i* const out2,
- __m128i* const out3) {
+static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
+ const __m128i* const mult,
+ __m128i* const out0,
+ __m128i* const out1,
+ __m128i* const out2,
+ __m128i* const out3) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
const __m128i A2 = _mm_srli_epi64(A0, 32);
@@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
}
}
-static WEBP_INLINE void ProcessRow(const __m128i* const A0,
- const __m128i* const A1,
- const __m128i* const A2,
- const __m128i* const A3,
- const __m128i* const mult,
- uint8_t* const dst) {
+static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
+ const __m128i* const A1,
+ const __m128i* const A2,
+ const __m128i* const A3,
+ const __m128i* const mult,
+ uint8_t* const dst) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
const __m128i B0 = _mm_mul_epu32(*A0, *mult);
@@ -210,7 +210,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
const __m128i C3 = _mm_add_epi64(B3, rounder);
const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
-#if (WEBP_RESCALER_FIX < 32)
+#if (WEBP_RESCALER_RFIX < 32)
const __m128i D2 =
_mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
const __m128i D3 =
@@ -226,7 +226,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
_mm_storel_epi64((__m128i*)dst, G);
}
-static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@@ -240,8 +240,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
if (wrk->y_accum == 0) {
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
- LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
- ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+ LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
+ ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
@@ -257,8 +257,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
- LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
- LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
+ LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
+ LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(A0, B0);
const __m128i C1 = _mm_add_epi64(A1, B1);
@@ -272,7 +272,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
- ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
+ ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
@@ -286,7 +286,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
}
}
-static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@@ -303,8 +303,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
- LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
- LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
+ LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+ LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(B0, rounder);
const __m128i C1 = _mm_add_epi64(B1, rounder);
@@ -324,7 +324,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
- ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+ ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
@@ -340,10 +340,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i zero = _mm_setzero_si128();
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
- LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+ LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
- ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+ ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
@@ -362,10 +362,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
- WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
- WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
- WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
- WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
+ WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
+ WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
+ WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
+ WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/ssim.c b/src/3rdparty/libwebp/src/dsp/ssim.c
new file mode 100644
index 0000000..dc1b518
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/ssim.c
@@ -0,0 +1,166 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h> // for abs()
+
+#include "src/dsp/dsp.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+ 1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
+
+static WEBP_INLINE double SSIMCalculation(
+ const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
+ const uint32_t w2 = N * N;
+ const uint32_t C1 = 20 * w2;
+ const uint32_t C2 = 60 * w2;
+ const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
+ const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+ const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+ if (xmxm + ymym >= C3) {
+ const int64_t xmym = (int64_t)stats->xm * stats->ym;
+ const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
+ const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+ const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+ // we descale by 8 to prevent overflow during the fnum/fden multiply.
+ const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+ const uint64_t den_S = (sxx + syy + C2) >> 8;
+ const uint64_t fnum = (2 * xmym + C1) * num_S;
+ const uint64_t fden = (xmxm + ymym + C1) * den_S;
+ const double r = (double)fnum / fden;
+ assert(r >= 0. && r <= 1.0);
+ return r;
+ }
+ return 1.; // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, kWeightSum);
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, stats->w);
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2,
+ int xo, int yo, int W, int H) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+ const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+ const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+ : yo + VP8_SSIM_KERNEL;
+ const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+ const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+ : xo + VP8_SSIM_KERNEL;
+ int x, y;
+ src1 += ymin * stride1;
+ src2 += ymin * stride2;
+ for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+ for (x = xmin; x <= xmax; ++x) {
+ const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+ * kWeight[VP8_SSIM_KERNEL + y - yo];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.w += w;
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
+ }
+ }
+ return VP8SSIMFromStatsClipped(&stats);
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+ int x, y;
+ for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+ for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+ const uint32_t w = kWeight[x] * kWeight[y];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
+ }
+ }
+ return VP8SSIMFromStats(&stats);
+}
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static uint32_t AccumulateSSE_C(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i;
+ uint32_t sse2 = 0;
+ assert(len <= 65535); // to ensure that accumulation fits within uint32_t
+ for (i = 0; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_REDUCE_SIZE)
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+#endif
+#if !defined(WEBP_DISABLE_STATS)
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
+
+extern void VP8SSIMDspInitSSE2(void);
+
+static volatile VP8CPUInfo ssim_last_cpuinfo_used =
+ (VP8CPUInfo)&ssim_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
+ if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+#if !defined(WEBP_REDUCE_SIZE)
+ VP8SSIMGetClipped = SSIMGetClipped_C;
+ VP8SSIMGet = SSIMGet_C;
+#endif
+
+#if !defined(WEBP_DISABLE_STATS)
+ VP8AccumulateSSE = AccumulateSSE_C;
+#endif
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8SSIMDspInitSSE2();
+ }
+#endif
+ }
+
+ ssim_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/3rdparty/libwebp/src/dsp/ssim_sse2.c b/src/3rdparty/libwebp/src/dsp/ssim_sse2.c
new file mode 100644
index 0000000..1dcb0eb
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/ssim_sse2.c
@@ -0,0 +1,165 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "src/dsp/common_sse2.h"
+
+#if !defined(WEBP_DISABLE_STATS)
+
+// Helper function
+static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i = 0;
+ uint32_t sse2 = 0;
+ if (len >= 16) {
+ const int limit = len - 32;
+ int32_t tmp[4];
+ __m128i sum1;
+ __m128i sum = _mm_setzero_si128();
+ __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ while (i <= limit) {
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ __m128i sum2;
+ i += 16;
+ SubtractAndSquare_SSE2(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ SubtractAndSquare_SSE2(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, sum2);
+ }
+ SubtractAndSquare_SSE2(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+ }
+
+ for (; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+#endif // !defined(WEBP_DISABLE_STATS)
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
+ uint16_t tmp[8];
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi16(*m, a);
+ _mm_storeu_si128((__m128i*)tmp, b);
+ return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+
+static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi32(*m, a);
+ const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
+ return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do { \
+ /* compute row weight (Wx * Wy) */ \
+ const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
+ const __m128i W = _mm_mullo_epi16(Wx, Wy); \
+ /* process 8 bytes at a time (7 bytes, actually) */ \
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+ /* convert to 16b and multiply by weight */ \
+ const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
+ const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
+ const __m128i wa1 = _mm_mullo_epi16(a1, W); \
+ const __m128i wb1 = _mm_mullo_epi16(b1, W); \
+ /* accumulate */ \
+ xm = _mm_add_epi16(xm, wa1); \
+ ym = _mm_add_epi16(ym, wb1); \
+ xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
+ xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
+ yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
+ src1 += stride1; \
+ src2 += stride2; \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i xm = zero, ym = zero; // 16b accums
+ __m128i xxm = zero, yym = zero, xym = zero; // 32b accum
+ const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+ assert(2 * VP8_SSIM_KERNEL + 1 == 7);
+ ACCUMULATE_ROW(1);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(4);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(1);
+ stats.xm = HorizontalAdd16b_SSE2(&xm);
+ stats.ym = HorizontalAdd16b_SSE2(&ym);
+ stats.xxm = HorizontalAdd32b_SSE2(&xxm);
+ stats.xym = HorizontalAdd32b_SSE2(&xym);
+ stats.yym = HorizontalAdd32b_SSE2(&yym);
+ return VP8SSIMFromStats(&stats);
+}
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+#if !defined(WEBP_DISABLE_STATS)
+ VP8AccumulateSSE = AccumulateSSE_SSE2;
+#endif
+#if !defined(WEBP_REDUCE_SIZE)
+ VP8SSIMGet = SSIMGet_SSE2;
+#endif
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 265e722..e72626a 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -11,8 +11,8 @@
//
// Author: somnath@google.com (Somnath Banerjee)
-#include "./dsp.h"
-#include "./yuv.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/yuv.h"
#include <assert.h>
@@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
- top_dst + (2 * x - 1) * XSTEP); \
+ top_dst + (2 * x - 1) * (XSTEP)); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
- top_dst + (2 * x - 0) * XSTEP); \
+ top_dst + (2 * x - 0) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
- bottom_dst + (2 * x - 1) * XSTEP); \
+ bottom_dst + (2 * x - 1) * (XSTEP)); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
- bottom_dst + (2 * x + 0) * XSTEP); \
+ bottom_dst + (2 * x + 0) * (XSTEP)); \
} \
tl_uv = t_uv; \
l_uv = uv; \
@@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
- top_dst + (len - 1) * XSTEP); \
+ top_dst + (len - 1) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
- bottom_dst + (len - 1) * XSTEP); \
+ bottom_dst + (len - 1) * (XSTEP)); \
} \
} \
}
// All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
-UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
+#if !WEBP_NEON_OMIT_C_CODE
+UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2)
+#else
+static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
+ const uint8_t* top_u, const uint8_t* top_v,
+ const uint8_t* cur_u, const uint8_t* cur_v,
+ uint8_t* top_dst, uint8_t* bottom_dst, int len) {
+ (void)top_y;
+ (void)bottom_y;
+ (void)top_u;
+ (void)top_v;
+ (void)cur_u;
+ (void)cur_v;
+ (void)top_dst;
+ (void)bottom_dst;
+ (void)len;
+ assert(0); // COLORSPACE SUPPORT NOT COMPILED
+}
+#define UpsampleArgbLinePair_C EmptyUpsampleFunc
+#define UpsampleRgbLinePair_C EmptyUpsampleFunc
+#define UpsampleBgrLinePair_C EmptyUpsampleFunc
+#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
+#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
+#endif // WEBP_REDUCE_CSP
+
+#endif
#undef LOAD_UV
#undef UPSAMPLE_FUNC
@@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
WebPInitUpsamplers();
- VP8YUVInit();
#ifdef FANCY_UPSAMPLING
return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
#else
@@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
- for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
+ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}
-YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3)
-YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3)
-YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4)
-YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4)
-YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4)
-YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
-YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3)
+YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3)
+YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2)
+#else
+static void EmptyYuv444Func(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ (void)y;
+ (void)u;
+ (void)v;
+ (void)dst;
+ (void)len;
+}
+#define WebPYuv444ToRgb_C EmptyYuv444Func
+#define WebPYuv444ToBgr_C EmptyYuv444Func
+#define WebPYuv444ToArgb_C EmptyYuv444Func
+#define WebPYuv444ToRgba4444_C EmptyYuv444Func
+#define WebPYuv444ToRgb565_C EmptyYuv444Func
+#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
@@ -182,17 +224,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
- WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC;
- WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC;
- WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC;
- WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC;
- WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC;
- WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
- WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C;
- WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC;
- WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC;
- WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC;
- WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
+ WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
+ WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
+ WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
+ WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C;
+ WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C;
+ WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
+ WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C;
+ WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C;
+ WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C;
+ WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C;
+ WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@@ -224,17 +266,19 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
#ifdef FANCY_UPSAMPLING
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
- WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
- WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
- WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
- WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
- WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#if !WEBP_NEON_OMIT_C_CODE
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C;
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
+#endif
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -243,11 +287,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
WebPInitUpsamplersSSE2();
}
#endif
-#if defined(WEBP_USE_NEON)
- if (VP8GetCPUInfo(kNEON)) {
- WebPInitUpsamplersNEON();
- }
-#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitUpsamplersMIPSdspR2();
@@ -259,6 +298,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
}
#endif
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ WebPInitUpsamplersNEON();
+ }
+#endif
+
+ assert(WebPUpsamplers[MODE_RGBA] != NULL);
+ assert(WebPUpsamplers[MODE_BGRA] != NULL);
+ assert(WebPUpsamplers[MODE_rgbA] != NULL);
+ assert(WebPUpsamplers[MODE_bgrA] != NULL);
+ assert(WebPUpsamplers[MODE_RGB] != NULL);
+ assert(WebPUpsamplers[MODE_BGR] != NULL);
+ assert(WebPUpsamplers[MODE_ARGB] != NULL);
+ assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
+ assert(WebPUpsamplers[MODE_RGB_565] != NULL);
+ assert(WebPUpsamplers[MODE_Argb] != NULL);
+ assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
+
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
index ed2eb74..10d499d 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
@@ -12,14 +12,12 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
-#include "./yuv.h"
-
-#if !defined(WEBP_YUV_USE_TABLE)
+#include "src/dsp/yuv.h"
#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
const int t1 = MultHi(Y, 19077); \
@@ -48,6 +46,7 @@
); \
} while (0)
+#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
@@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
{
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
{
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
#endif
}
}
-#endif // WEBP_YUV_USE_TABLE
+#endif // WEBP_REDUCE_CSP
//-----------------------------------------------------------------------------
// Alpha handling variants
+#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
int r, g, b;
@@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
argb[2] = g;
argb[3] = b;
}
+#endif // WEBP_REDUCE_CSP
static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
int r, g, b;
@@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
#undef LOAD_UV
#undef UPSAMPLE_FUNC
@@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
extern void WebPInitUpsamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
@@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
-YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
-YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
+YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
@@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
- WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
- WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
- WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
- WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+#endif // WEBP_REDUCE_CSP
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
index f24926f..535ffb7 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
@@ -12,12 +12,12 @@
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include <string.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
-#include "./msa_macro.h"
-#include "./yuv.h"
+#include "src/dsp/msa_macro.h"
+#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@@ -274,7 +274,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@@ -293,7 +293,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
- const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(R, G, B, A, dst);
@@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
- const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(B, G, R, A, dst);
@@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
- const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(A, R, G, B, dst);
@@ -459,11 +459,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) {
- #ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
- #else
+#else
CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
- #endif
+#endif
y += 16;
u += 16;
v += 16;
@@ -473,7 +473,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
@@ -482,7 +482,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
@@ -495,11 +495,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) {
- #ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(y, u, v, GB, RG, 16, dst);
- #else
+#else
CALC_RGB565(y, u, v, RG, GB, 16, dst);
- #endif
+#endif
y += 16;
u += 16;
v += 16;
@@ -509,7 +509,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 16, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 16, temp);
@@ -518,7 +518,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 8, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 8, temp);
@@ -640,13 +640,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
} \
}
-UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
@@ -656,17 +658,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
index d371a83..17cbc9f 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -12,15 +12,15 @@
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <arm_neon.h>
#include <string.h>
-#include "./neon.h"
-#include "./yuv.h"
+#include "src/dsp/neon.h"
+#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@@ -58,8 +58,8 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
- uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
+ uint8_t *out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
@@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
- Upsample16Pixels(r1, r2, out); \
+ Upsample16Pixels_NEON(r1, r2, out); \
}
//-----------------------------------------------------------------------------
@@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
}
// NEON variants of the fancy upsampler.
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
-NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
-NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
-NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
-NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
-NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
-NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3)
+NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
+NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
+#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
@@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
- WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
- WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
- WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
- WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
- WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
+#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
index b5b6689..fd5d303 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -11,14 +11,14 @@
//
// Author: somnath@google.com (Somnath Banerjee)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@@ -83,13 +83,13 @@
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
- PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
- PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
+ PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
+ PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
- uint8_t* const out) {
+static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
+ uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@@ -101,30 +101,30 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
- Upsample32Pixels(r1, r2, out); \
+ Upsample32Pixels_SSE2(r1, r2, out); \
}
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
for (n = 0; n < (num_pixels); ++n) { \
- FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
- top_dst + ((cur_x) + n) * XSTEP); \
+ FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n], \
+ (top_dst) + ((cur_x) + n) * (XSTEP)); \
} \
- if (bottom_y != NULL) { \
+ if ((bottom_y) != NULL) { \
for (n = 0; n < (num_pixels); ++n) { \
- FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
- bottom_dst + ((cur_x) + n) * XSTEP); \
+ FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
+ (bottom_dst) + ((cur_x) + n) * (XSTEP)); \
} \
} \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
- FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
- if (bottom_y != NULL) { \
- FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
- bottom_dst + (cur_x) * XSTEP); \
+ FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
+ if ((bottom_y) != NULL) { \
+ FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
+ (bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
@@ -169,13 +169,16 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// SSE2 variants of the fancy upsampler.
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
-SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
+
+#if !defined(WEBP_REDUCE_CSP)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3)
+SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
#undef GET_M
#undef PACK_AND_STORE
@@ -193,17 +196,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
- WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
- WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
- WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
- WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
- WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
- WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
- WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
- WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
- WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
+#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
@@ -213,29 +218,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void);
-#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
-extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \
- const uint8_t* v, uint8_t* dst, int len); \
+#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
+extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
- for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
+ for (i = 0; i < max_len; i += 32) { \
+ CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
+ } \
if (i < len) { /* C-fallback */ \
- WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \
+ CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
-YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
-YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
-YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
+YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
+YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
+YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
+YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
+YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
+ WebPYuv444ToRgba4444_C, 2)
+YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
+#endif // WEBP_REDUCE_CSP
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
- WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
- WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
- WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
- WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
+ WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2;
+ WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2;
+ WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2;
+ WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2;
+ WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2;
+ WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
+ WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2;
+ WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2;
+ WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
+#endif // WEBP_REDUCE_CSP
}
#else
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index dd7d9de..bddf81f 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -11,63 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
+#include <assert.h>
#include <stdlib.h>
-#if defined(WEBP_YUV_USE_TABLE)
-
-static int done = 0;
-
-static WEBP_INLINE uint8_t clip(int v, int max_value) {
- return v < 0 ? 0 : v > max_value ? max_value : v;
-}
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
- int i;
- if (done) {
- return;
- }
-#ifndef USE_YUVj
- for (i = 0; i < 256; ++i) {
- VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
- VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
- VP8kVToG[i] = -45773 * (i - 128);
- VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
- }
- for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
- const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
- VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
- VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
- }
-#else
- for (i = 0; i < 256; ++i) {
- VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
- VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
- VP8kVToG[i] = -46802 * (i - 128);
- VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
- }
- for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
- const int k = i;
- VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
- VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
- }
-#endif
-
- done = 1;
-}
-
-#else
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
-
-#endif // WEBP_YUV_USE_TABLE
-
//-----------------------------------------------------------------------------
// Plain-C version
@@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
- const uint8_t* const end = dst + (len & ~1) * XSTEP; \
+ const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \
- FUNC(y[1], u[0], v[0], dst + XSTEP); \
+ FUNC(y[1], u[0], v[0], dst + (XSTEP)); \
y += 2; \
++u; \
++v; \
- dst += 2 * XSTEP; \
+ dst += 2 * (XSTEP); \
} \
if (len & 1) { \
FUNC(y[0], u[0], v[0], dst); \
@@ -168,7 +116,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
//-----------------------------------------------------------------------------
// ARGB -> YUV converters
-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t p = argb[i];
@@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
//-----------------------------------------------------------------------------
-static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, rgb += 3) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, bgr += 3) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
@@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
+#if !WEBP_NEON_OMIT_C_CODE
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
@@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
+#endif // !WEBP_NEON_OMIT_C_CODE
#undef MAX_Y
@@ -308,22 +258,26 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
+extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
- WebPConvertARGBToY = ConvertARGBToY;
+ WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
- WebPConvertRGB24ToY = ConvertRGB24ToY;
- WebPConvertBGR24ToY = ConvertBGR24ToY;
+ WebPConvertRGB24ToY = ConvertRGB24ToY_C;
+ WebPConvertBGR24ToY = ConvertBGR24ToY_C;
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
+#if !WEBP_NEON_OMIT_C_CODE
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
+#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@@ -333,5 +287,23 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
}
#endif // WEBP_USE_SSE2
}
+
+#if defined(WEBP_USE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ WebPInitConvertARGBToYUVNEON();
+ WebPInitSharpYUVNEON();
+ }
+#endif // WEBP_USE_NEON
+
+ assert(WebPConvertARGBToY != NULL);
+ assert(WebPConvertARGBToUV != NULL);
+ assert(WebPConvertRGB24ToY != NULL);
+ assert(WebPConvertBGR24ToY != NULL);
+ assert(WebPConvertRGBA32ToUV != NULL);
+ assert(WebPSharpYUVUpdateY != NULL);
+ assert(WebPSharpYUVUpdateRGB != NULL);
+ assert(WebPSharpYUVFilterRow != NULL);
+
rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index 1d33b58..c8a5583 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -35,18 +35,8 @@
#ifndef WEBP_DSP_YUV_H_
#define WEBP_DSP_YUV_H_
-#include "./dsp.h"
-#include "../dec/vp8_dec.h"
-
-#if defined(WEBP_EXPERIMENTAL_FEATURES)
-// Do NOT activate this feature for real compression. This is only experimental!
-// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
-// This colorspace is close to Rec.601's Y'CbCr model with the notable
-// difference of allowing larger range for luma/chroma.
-// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
-// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-// #define USE_YUVj
-#endif
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8_dec.h"
//------------------------------------------------------------------------------
// YUV -> RGB conversion
@@ -58,12 +48,8 @@ extern "C" {
enum {
YUV_FIX = 16, // fixed-point precision for RGB->YUV
YUV_HALF = 1 << (YUV_FIX - 1),
- YUV_MASK = (256 << YUV_FIX) - 1,
- YUV_RANGE_MIN = -227, // min value of r/g/b output
- YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
YUV_FIX2 = 6, // fixed-point precision for YUV->RGB
- YUV_HALF2 = 1 << YUV_FIX2 >> 1,
YUV_MASK2 = (256 << YUV_FIX2) - 1
};
@@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
const int b = VP8YUVToB(y, u); // 5 usable bits
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
const int b = VP8YUVToB(y, u); // 4 usable bits
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@@ -157,29 +143,26 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
rgba[3] = 0xff;
}
-// Must be called before everything, to initialize the tables.
-void VP8YUVInit(void);
-
//-----------------------------------------------------------------------------
// SSE2 extra functions (mostly for upsampling_sse2.c)
#if defined(WEBP_USE_SSE2)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst);
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst);
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst);
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst);
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst);
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst);
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst);
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
#endif // WEBP_USE_SSE2
@@ -192,8 +175,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
}
-#ifndef USE_YUVj
-
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 16839 * r + 33059 * g + 6420 * b;
return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip
@@ -209,28 +190,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
return VP8ClipUV(v, rounding);
}
-#else
-
-// This JPEG-YUV colorspace, only for comparison!
-// These are also 16bit precision coefficients from Rec.601, but with full
-// [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
- const int luma = 19595 * r + 38470 * g + 7471 * b;
- return (luma + rounding) >> YUV_FIX; // no need to clip
-}
-
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
- const int u = -11058 * r - 21710 * g + 32768 * b;
- return VP8ClipUV(u, rounding);
-}
-
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
- const int v = 32768 * r - 27439 * g - 5329 * b;
- return VP8ClipUV(v, rounding);
-}
-
-#endif // USE_YUVj
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
index e61aac5..9d0a887 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
@@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
@@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y, \
} \
}
-ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
-ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
-ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
-ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
+ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
#undef ROW_FUNC
@@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
extern void WebPInitSamplersMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
- WebPSamplers[MODE_RGB] = YuvToRgbRow;
- WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
- WebPSamplers[MODE_BGR] = YuvToBgrRow;
- WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
}
#else // !WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
index 1720d41..cc8afcc 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
@@ -12,11 +12,11 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
@@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y, \
} \
}
-ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
-ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
-ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
-ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
+ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
#undef ROW_FUNC
#undef ASM_CLOBBER_LIST
@@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
extern void WebPInitSamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
- WebPSamplers[MODE_RGB] = YuvToRgbRow;
- WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
- WebPSamplers[MODE_BGR] = YuvToBgrRow;
- WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_neon.c b/src/3rdparty/libwebp/src/dsp/yuv_neon.c
new file mode 100644
index 0000000..a34d602
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/yuv_neon.c
@@ -0,0 +1,288 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/yuv.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "src/dsp/neon.h"
+
+//-----------------------------------------------------------------------------
+
+static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
+ const uint8x8_t G,
+ const uint8x8_t B) {
+ const uint16x8_t r = vmovl_u8(R);
+ const uint16x8_t g = vmovl_u8(G);
+ const uint16x8_t b = vmovl_u8(B);
+ const uint16x4_t r_lo = vget_low_u16(r);
+ const uint16x4_t r_hi = vget_high_u16(r);
+ const uint16x4_t g_lo = vget_low_u16(g);
+ const uint16x4_t g_hi = vget_high_u16(g);
+ const uint16x4_t b_lo = vget_low_u16(b);
+ const uint16x4_t b_hi = vget_high_u16(b);
+ const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u);
+ const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u);
+ const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
+ const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
+ const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
+ const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
+ const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
+ vrshrn_n_u32(tmp2_hi, 16));
+ const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
+ return vqmovn_u16(Y2);
+}
+
+static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
+ const uint8x8x3_t RGB = vld3_u8(rgb);
+ const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
+ vst1_u8(y + i, Y);
+ }
+ for (; i < width; ++i, rgb += 3) { // left-over
+ y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+ }
+}
+
+static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
+ const uint8x8x3_t BGR = vld3_u8(bgr);
+ const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
+ vst1_u8(y + i, Y);
+ }
+ for (; i < width; ++i, bgr += 3) { // left-over
+ y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+ }
+}
+
+static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i + 8 <= width; i += 8) {
+ const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
+ const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
+ vst1_u8(y + i, Y);
+ }
+ for (; i < width; ++i) { // left-over
+ const uint32_t p = argb[i];
+ y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+ YUV_HALF);
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
+#define MULTIPLY_16b_PREAMBLE(r, g, b) \
+ const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \
+ const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
+ const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \
+ const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
+ const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \
+ const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
+
+#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \
+ const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \
+ const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \
+ const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \
+ const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \
+ const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \
+ const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \
+ const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \
+ vshrn_n_s32(tmp2_hi, 16)); \
+ DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \
+} while (0)
+
+// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
+#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \
+ MULTIPLY_16b_PREAMBLE(r, g, b); \
+ MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \
+ MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
+} while (0)
+
+static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width) {
+ int i;
+ for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
+ const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
+ int16x8_t U, V;
+ CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
+ vst1_u8(u + i, vqrshrun_n_s16(U, 2));
+ vst1_u8(v + i, vqrshrun_n_s16(V, 2));
+ }
+ for (; i < width; i += 1, rgb += 4) {
+ const int r = rgb[0], g = rgb[1], b = rgb[2];
+ u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ }
+}
+
+static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store) {
+ int i;
+ for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
+ const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
+ const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds
+ const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
+ const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
+ int16x8_t U_tmp, V_tmp;
+ CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
+ {
+ const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
+ const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
+ if (do_store) {
+ vst1_u8(u, U);
+ vst1_u8(v, V);
+ } else {
+ const uint8x8_t prev_u = vld1_u8(u);
+ const uint8x8_t prev_v = vld1_u8(v);
+ vst1_u8(u, vrhadd_u8(U, prev_u));
+ vst1_u8(v, vrhadd_u8(V, prev_v));
+ }
+ }
+ }
+ if (i < src_width) { // left-over
+ WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+ }
+}
+
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
+ WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
+ WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
+ WebPConvertARGBToY = ConvertARGBToY_NEON;
+ WebPConvertARGBToUV = ConvertARGBToUV_NEON;
+ WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
+}
+
+//------------------------------------------------------------------------------
+
+#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
+static uint16_t clip_y_NEON(int v) {
+ return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len) {
+ int i;
+ const int16x8_t zero = vdupq_n_s16(0);
+ const int16x8_t max = vdupq_n_s16(MAX_Y);
+ uint64x2_t sum = vdupq_n_u64(0);
+ uint64_t diff;
+
+ for (i = 0; i + 8 <= len; i += 8) {
+ const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+ const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+ const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+ const int16x8_t D = vsubq_s16(A, B); // diff_y
+ const int16x8_t F = vaddq_s16(C, D); // new_y
+ const uint16x8_t H =
+ vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
+ const int16x8_t I = vabsq_s16(D); // abs(diff_y)
+ vst1q_u16(dst + i, H);
+ sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+ }
+ diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
+ for (; i < len; ++i) {
+ const int diff_y = ref[i] - src[i];
+ const int new_y = (int)(dst[i]) + diff_y;
+ dst[i] = clip_y_NEON(new_y);
+ diff += (uint64_t)(abs(diff_y));
+ }
+ return diff;
+}
+
+static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len) {
+ int i;
+ for (i = 0; i + 8 <= len; i += 8) {
+ const int16x8_t A = vld1q_s16(ref + i);
+ const int16x8_t B = vld1q_s16(src + i);
+ const int16x8_t C = vld1q_s16(dst + i);
+ const int16x8_t D = vsubq_s16(A, B); // diff_uv
+ const int16x8_t E = vaddq_s16(C, D); // new_uv
+ vst1q_s16(dst + i, E);
+ }
+ for (; i < len; ++i) {
+ const int diff_uv = ref[i] - src[i];
+ dst[i] += diff_uv;
+ }
+}
+
+static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out) {
+ int i;
+ const int16x8_t max = vdupq_n_s16(MAX_Y);
+ const int16x8_t zero = vdupq_n_s16(0);
+ for (i = 0; i + 8 <= len; i += 8) {
+ const int16x8_t a0 = vld1q_s16(A + i + 0);
+ const int16x8_t a1 = vld1q_s16(A + i + 1);
+ const int16x8_t b0 = vld1q_s16(B + i + 0);
+ const int16x8_t b1 = vld1q_s16(B + i + 1);
+ const int16x8_t a0b1 = vaddq_s16(a0, b1);
+ const int16x8_t a1b0 = vaddq_s16(a1, b0);
+ const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
+ const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
+ const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
+ const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+ const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+ const int16x8_t d0 = vaddq_s16(c1, a0);
+ const int16x8_t d1 = vaddq_s16(c0, a1);
+ const int16x8_t e0 = vrshrq_n_s16(d0, 1);
+ const int16x8_t e1 = vrshrq_n_s16(d1, 1);
+ const int16x8x2_t f = vzipq_s16(e0, e1);
+ const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+ const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+ const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+ const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+ const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
+ const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+ vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+ vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+ }
+ for (; i < len; ++i) {
+ const int a0b1 = A[i + 0] + B[i + 1];
+ const int a1b0 = A[i + 1] + B[i + 0];
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+ out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
+ out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
+ }
+}
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSharpYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
+ WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
+ WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
+ WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
+WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
index e33c2bb..6810bf8 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -11,11 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
#if defined(WEBP_USE_SSE2)
-#include "./common_sse2.h"
+#include "src/dsp/common_sse2.h"
#include <stdlib.h>
#include <emmintrin.h>
@@ -26,12 +26,12 @@
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
-static void ConvertYUV444ToRGB(const __m128i* const Y0,
- const __m128i* const U0,
- const __m128i* const V0,
- __m128i* const R,
- __m128i* const G,
- __m128i* const B) {
+static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
+ const __m128i* const U0,
+ const __m128i* const V0,
+ __m128i* const R,
+ __m128i* const G,
+ __m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234);
@@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0,
}
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
-static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
+static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
}
// Load and replicate the U/V samples
-static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
+static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
@@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
}
// Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB(const uint8_t* const y,
- const uint8_t* const u,
- const uint8_t* const v,
- __m128i* const R, __m128i* const G, __m128i* const B) {
- const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
- ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
+static void YUV444ToRGB_SSE2(const uint8_t* const y,
+ const uint8_t* const u,
+ const uint8_t* const v,
+ __m128i* const R, __m128i* const G,
+ __m128i* const B) {
+ const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
+ V0 = Load_HI_16_SSE2(v);
+ ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
}
// Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB(const uint8_t* const y,
- const uint8_t* const u,
- const uint8_t* const v,
- __m128i* const R, __m128i* const G, __m128i* const B) {
- const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
- ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
+static void YUV420ToRGB_SSE2(const uint8_t* const y,
+ const uint8_t* const u,
+ const uint8_t* const v,
+ __m128i* const R, __m128i* const G,
+ __m128i* const B) {
+ const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
+ V0 = Load_UV_HI_8_SSE2(v);
+ ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
}
// Pack R/G/B/A results into 32b output.
-static WEBP_INLINE void PackAndStore4(const __m128i* const R,
- const __m128i* const G,
- const __m128i* const B,
- const __m128i* const A,
- uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ const __m128i* const A,
+ uint8_t* const dst) {
const __m128i rb = _mm_packus_epi16(*R, *B);
const __m128i ga = _mm_packus_epi16(*G, *A);
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R,
}
// Pack R/G/B/A results into 16b output.
-static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
- const __m128i* const G,
- const __m128i* const B,
- const __m128i* const A,
- uint8_t* const dst) {
-#if !defined(WEBP_SWAP_16BIT_CSP)
+static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ const __m128i* const A,
+ uint8_t* const dst) {
+#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rg0 = _mm_packus_epi16(*R, *G);
const __m128i ba0 = _mm_packus_epi16(*B, *A);
#else
@@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
}
// Pack R/G/B results into 16b output.
-static WEBP_INLINE void PackAndStore565(const __m128i* const R,
- const __m128i* const G,
- const __m128i* const B,
- uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ uint8_t* const dst) {
const __m128i r0 = _mm_packus_epi16(*R, *R);
const __m128i g0 = _mm_packus_epi16(*G, *G);
const __m128i b0 = _mm_packus_epi16(*B, *B);
@@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
const __m128i rg = _mm_or_si128(r1, g1);
const __m128i gb = _mm_or_si128(g2, b1);
-#if !defined(WEBP_SWAP_16BIT_CSP)
+#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
#else
const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
@@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
- __m128i* const in2, __m128i* const in3,
- __m128i* const in4, __m128i* const in5,
- uint8_t* const rgb) {
+static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
+ __m128i* const in2, __m128i* const in3,
+ __m128i* const in4, __m128i* const in5,
+ uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
- YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
- PackAndStore4(&R, &G, &B, &kAlpha, dst);
+ YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
}
}
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
- YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
- PackAndStore4(&B, &G, &R, &kAlpha, dst);
+ YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
}
}
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
- YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
- PackAndStore4(&kAlpha, &R, &G, &B, dst);
+ YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
}
}
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
- YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
- PackAndStore4444(&R, &G, &B, &kAlpha, dst);
+ YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
}
}
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
- YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
- PackAndStore565(&R, &G, &B, dst);
+ YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore565_SSE2(&R, &G, &B, dst);
}
}
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
- YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
- YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
- YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
- YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+ YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+ YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+ YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
@@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
- PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+ PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
- YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
- YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
- YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
- YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+ YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+ YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+ YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
@@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
bgr5= _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
- PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+ PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
-static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToRgbaRow_SSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
- YUV420ToRGB(y, u, v, &R, &G, &B);
- PackAndStore4(&R, &G, &B, &kAlpha, dst);
+ YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+ PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
y += 8;
u += 4;
v += 4;
@@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
-static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToBgraRow_SSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
- YUV420ToRGB(y, u, v, &R, &G, &B);
- PackAndStore4(&B, &G, &R, &kAlpha, dst);
+ YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+ PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
y += 8;
u += 4;
v += 4;
@@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
-static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToArgbRow_SSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
- YUV420ToRGB(y, u, v, &R, &G, &B);
- PackAndStore4(&kAlpha, &R, &G, &B, dst);
+ YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+ PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
y += 8;
u += 4;
v += 4;
@@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
-static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
- YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
- YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
- YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
- YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+ YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
+ YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
+ YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
@@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
- PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+ PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
@@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
-static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE2(const uint8_t* y,
+ const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
- YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
- YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
- YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
- YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+ YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
+ YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
+ YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
@@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
- PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+ PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
@@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
extern void WebPInitSamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
- WebPSamplers[MODE_RGB] = YuvToRgbRow;
- WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
- WebPSamplers[MODE_BGR] = YuvToBgrRow;
- WebPSamplers[MODE_BGRA] = YuvToBgraRow;
- WebPSamplers[MODE_ARGB] = YuvToArgbRow;
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
+ WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
}
//------------------------------------------------------------------------------
@@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
// Function that inserts a value of the second half of the in buffer in between
// every two char of the first half.
-static WEBP_INLINE void RGB24PackedToPlanarHelper(
+static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
out[0] = _mm_unpacklo_epi8(in[0], in[3]);
out[1] = _mm_unpackhi_epi8(in[0], in[3]);
@@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper(
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
-static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
- __m128i* const out /*out[6]*/) {
+static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
+ const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@@ -468,16 +477,16 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
- RGB24PackedToPlanarHelper(tmp, out);
- RGB24PackedToPlanarHelper(out, tmp);
- RGB24PackedToPlanarHelper(tmp, out);
- RGB24PackedToPlanarHelper(out, tmp);
- RGB24PackedToPlanarHelper(tmp, out);
+ RGB24PackedToPlanarHelper_SSE2(tmp, out);
+ RGB24PackedToPlanarHelper_SSE2(out, tmp);
+ RGB24PackedToPlanarHelper_SSE2(tmp, out);
+ RGB24PackedToPlanarHelper_SSE2(out, tmp);
+ RGB24PackedToPlanarHelper_SSE2(tmp, out);
}
// Convert 8 packed ARGB to r[], g[], b[]
-static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
- __m128i* const rgb /*in[6]*/) {
+static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
+ __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
@@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
-static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
- const __m128i* const G,
- const __m128i* const B,
- __m128i* const Y) {
+static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ __m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
-static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
- const __m128i* const G,
- const __m128i* const B,
- __m128i* const U, __m128i* const V) {
+static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ __m128i* const U,
+ __m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
@@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
#undef MK_CST_16
#undef TRANSFORM
-static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
__m128i rgb_plane[6];
int j;
- RGB24PackedToPlanar(rgb, rgb_plane);
+ RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
@@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
- ConvertRGBToY(&r, &g, &b, &Y0);
+ ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
- ConvertRGBToY(&r, &g, &b, &Y1);
+ ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
}
}
-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
__m128i bgr_plane[6];
int j;
- RGB24PackedToPlanar(bgr, bgr_plane);
+ RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
@@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
- ConvertRGBToY(&r, &g, &b, &Y0);
+ ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
- ConvertRGBToY(&r, &g, &b, &Y1);
+ ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
}
}
-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6];
- RGB32PackedToPlanar(&argb[i], rgb);
- ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
- ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
+ RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+ ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
+ ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
@@ -636,31 +646,33 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
-static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
- __m128i* const out) {
+static void HorizontalAddPack_SSE2(const __m128i* const A,
+ const __m128i* const B,
+ __m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
-static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
- int src_width, int do_store) {
+static void ConvertARGBToUV_SSE2(const uint32_t* argb,
+ uint8_t* u, uint8_t* v,
+ int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1;
- RGB32PackedToPlanar(&argb[i], rgb);
- HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
- HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
- HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
- ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
-
- RGB32PackedToPlanar(&argb[i + 16], rgb);
- HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
- HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
- HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
- ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+ RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+ HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+ HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+ HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+ ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+ RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
+ HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+ HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+ HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+ ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
@@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
-static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
- __m128i* const r,
- __m128i* const g,
- __m128i* const b) {
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
+ const uint16_t* const rgbx,
+ __m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
@@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
*b = _mm_unpacklo_epi64(B1, B3);
}
-static void ConvertRGBA32ToUV(const uint16_t* rgb,
- uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
- RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b);
- ConvertRGBToUV(&r, &g, &b, &U0, &V0);
- RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
- ConvertRGBToUV(&r, &g, &b, &U1, &V1);
+ RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
+ ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
+ RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
+ ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
@@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb,
extern void WebPInitConvertARGBToYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
- WebPConvertARGBToY = ConvertARGBToY;
- WebPConvertARGBToUV = ConvertARGBToUV;
+ WebPConvertARGBToY = ConvertARGBToY_SSE2;
+ WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
- WebPConvertRGB24ToY = ConvertRGB24ToY;
- WebPConvertBGR24ToY = ConvertBGR24ToY;
+ WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
+ WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
- WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
+ WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
}
//------------------------------------------------------------------------------