1 files changed, 102 insertions, 190 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index f31bc6d..1c807f1 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -14,16 +14,18 @@
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 
-#include "./dsp.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"
 
 static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int clip_max(int v, int max) {
   return (v > max) ? max : v;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
@@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
   histo->last_non_zero = last_non_zero;
 }
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+                               int start_block, int end_block,
+                               VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
@@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
   }
   VP8SetHistogramData(distribution, histo);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // run-time tables (~4k)
@@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
+#if !WEBP_NEON_OMIT_C_CODE
+
 #define STORE(x, y, v) \
   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
 
@@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   }
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                         int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   int i;
   int tmp[16];
   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
     out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+                          int16_t* out) {
   VP8FTransform(src, ref, out);
   VP8FTransform(src + 4, ref + 4, out + 16);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
   // input is 12b signed
   int32_t tmp[16];
   int i;
@@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
     out[12 + i] = b3 >> 1;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef MUL
 #undef STORE
@@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top) {
   // U block
   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   VerticalPred(C8VE8 + dst, top, 8);
@@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_C(uint8_t* dst,
+                           const uint8_t* left, const uint8_t* top) {
   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
   VerticalPred(I16VE16 + dst, top, 16);
   HorizontalPred(I16HE16 + dst, left, 16);
@@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                               int w, int h) {
   int count = 0;
@@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
   return count;
 }
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 4, 4);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
   int k, x, y;
   for (k = 0; k < 4; ++k) {
     uint32_t avg = 0;
@@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.
 
+#if !WEBP_NEON_OMIT_C_CODE
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
@@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   return sum;
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
   const int sum1 = TTransform(a, w);
   const int sum2 = TTransform(b, w);
   return abs(sum2 - sum1) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_C(a + x + y, b + x + y, w);
     }
   }
   return D;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Quantization
@@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
 };
 
 // Simple quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+                           const VP8Matrix* const mtx) {
   int last = -1;
   int n;
   for (n = 0; n < 16; ++n) {
@@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (last >= 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+                             const VP8Matrix* const mtx) {
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 // Block copy
@@ -682,149 +698,15 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
   }
 }
 
-static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
   Copy(src, dst, 4, 4);
 }
 
-static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
   Copy(src, dst, 16, 8);
 }
 
 //------------------------------------------------------------------------------
-// SSIM / PSNR
-
-// hat-shaped filter. Sum of coefficients is equal to 16.
-static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
-  1, 2, 3, 4, 3, 2, 1
-};
-static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
-
-static WEBP_INLINE double SSIMCalculation(
-    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const uint32_t w2 =  N * N;
-  const uint32_t C1 = 20 * w2;
-  const uint32_t C2 = 60 * w2;
-  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
-  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
-  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
-  if (xmxm + ymym >= C3) {
-    const int64_t xmym = (int64_t)stats->xm * stats->ym;
-    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
-    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
-    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
-    // we descale by 8 to prevent overflow during the fnum/fden multiply.
-    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
-    const uint64_t den_S = (sxx + syy + C2) >> 8;
-    const uint64_t fnum = (2 * xmym + C1) * num_S;
-    const uint64_t fden = (xmxm + ymym + C1) * den_S;
-    const double r = (double)fnum / fden;
-    assert(r >= 0. && r <= 1.0);
-    return r;
-  }
-  return 1.;   // area is too dark to contribute meaningfully
-}
-
-double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, kWeightSum);
-}
-
-double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, stats->w);
-}
-
-static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
-                               const uint8_t* src2, int stride2,
-                               int xo, int yo, int W, int H) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
-  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
-                                                  : yo + VP8_SSIM_KERNEL;
-  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
-  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
-                                                  : xo + VP8_SSIM_KERNEL;
-  int x, y;
-  src1 += ymin * stride1;
-  src2 += ymin * stride2;
-  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
-    for (x = xmin; x <= xmax; ++x) {
-      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
-                       * kWeight[VP8_SSIM_KERNEL + y - yo];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.w   += w;
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStatsClipped(&stats);
-}
-
-static double SSIMGet_C(const uint8_t* src1, int stride1,
-                        const uint8_t* src2, int stride2) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  int x, y;
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const uint32_t w = kWeight[x] * kWeight[y];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStats(&stats);
-}
-
-//------------------------------------------------------------------------------
-
-static uint32_t AccumulateSSE(const uint8_t* src1,
-                              const uint8_t* src2, int len) {
-  int i;
-  uint32_t sse2 = 0;
-  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
-  for (i = 0; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-//------------------------------------------------------------------------------
-
-VP8SSIMGetFunc VP8SSIMGet;
-VP8SSIMGetClippedFunc VP8SSIMGetClipped;
-VP8AccumulateSSEFunc VP8AccumulateSSE;
-
-extern void VP8SSIMDspInitSSE2(void);
-
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
-    (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
-  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8SSIMGetClipped = SSIMGetClipped_C;
-  VP8SSIMGet = SSIMGet_C;
-
-  VP8AccumulateSSE = AccumulateSSE;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SSIMDspInitSSE2();
-    }
-#endif
-  }
-
-  ssim_last_cpuinfo_used = VP8GetCPUInfo;
-}
-
-//------------------------------------------------------------------------------
 // Initialization
 
 // Speed-critical function pointers. We have to initialize them to the default
@@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
   InitTables();
 
   // default C implementations
-  VP8CollectHistogram = CollectHistogram;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
-  VP8Copy4x4 = Copy4x4;
-  VP8Copy16x8 = Copy16x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8ITransform = ITransform_C;
+  VP8FTransform = FTransform_C;
+  VP8FTransformWHT = FTransformWHT_C;
+  VP8TDisto4x4 = Disto4x4_C;
+  VP8TDisto16x16 = Disto16x16_C;
+  VP8CollectHistogram = CollectHistogram_C;
+  VP8SSE16x16 = SSE16x16_C;
+  VP8SSE16x8 = SSE16x8_C;
+  VP8SSE8x8 = SSE8x8_C;
+  VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8EncQuantizeBlock = QuantizeBlock_C;
+  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
+  VP8EncPredLuma4 = Intra4Preds_C;
+  VP8EncPredLuma16 = Intra16Preds_C;
+  VP8EncPredChroma8 = IntraChromaPreds_C;
+  VP8Mean16x4 = Mean16x4_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+  VP8Copy4x4 = Copy4x4_C;
+  VP8Copy16x8 = Copy16x8_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
       VP8EncDspInitAVX2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8EncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8EncDspInitMIPS32();
@@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8EncDspInitNEON();
+  }
+#endif
+
+  assert(VP8ITransform != NULL);
+  assert(VP8FTransform != NULL);
+  assert(VP8FTransformWHT != NULL);
+  assert(VP8TDisto4x4 != NULL);
+  assert(VP8TDisto16x16 != NULL);
+  assert(VP8CollectHistogram != NULL);
+  assert(VP8SSE16x16 != NULL);
+  assert(VP8SSE16x8 != NULL);
+  assert(VP8SSE8x8 != NULL);
+  assert(VP8SSE4x4 != NULL);
+  assert(VP8EncQuantizeBlock != NULL);
+  assert(VP8EncQuantize2Blocks != NULL);
+  assert(VP8FTransform2 != NULL);
+  assert(VP8EncPredLuma4 != NULL);
+  assert(VP8EncPredLuma16 != NULL);
+  assert(VP8EncPredChroma8 != NULL);
+  assert(VP8Mean16x4 != NULL);
+  assert(VP8EncQuantizeBlockWHT != NULL);
+  assert(VP8Copy4x4 != NULL);
+  assert(VP8Copy16x8 != NULL);
+
   enc_last_cpuinfo_used = VP8GetCPUInfo;
 }