1 files changed, 116 insertions, 27 deletions
diff --git a/src/3rdparty/libwebp/src/enc/quant_enc.c b/src/3rdparty/libwebp/src/enc/quant_enc.c
index b118fb2..01eb565 100644
--- a/src/3rdparty/libwebp/src/enc/quant_enc.c
+++ b/src/3rdparty/libwebp/src/enc/quant_enc.c
@@ -15,8 +15,9 @@
 #include <math.h>
 #include <stdlib.h>  // for abs()
 
-#include "./vp8i_enc.h"
-#include "./cost_enc.h"
+#include "src/dsp/quant.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
 
 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@@ -32,7 +33,7 @@
 
 // number of non-zero coeffs below which we consider the block very flat
 // (and apply a penalty to complex predictions)
-#define FLATNESS_LIMIT_I16 10      // I16 mode
+#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
 #define FLATNESS_LIMIT_I4  3       // I4 mode
 #define FLATNESS_LIMIT_UV  2       // UV mode
 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
@@ -457,11 +458,11 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
 // Form the predictions in cache
 
 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
-const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
-const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
+const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
+const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
 
 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
-const int VP8I4ModeOffsets[NUM_BMODES] = {
+const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
 };
 
@@ -492,14 +493,14 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // |YYYY|....| 12
 // +----+----+
 
-const int VP8Scan[16] = {  // Luma
+const uint16_t VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
 };
 
-static const int VP8ScanUV[4 + 4] = {
+static const uint16_t VP8ScanUV[4 + 4] = {
   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
@@ -826,6 +827,85 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   return nz;
 }
 
+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7    // fraction of error sent to the 4x4 block below
+#define C2 8    // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
+
+// Quantize as usual, but also compute and return the quantization error.
+// Error is already divided by DSHIFT.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+  int V = *v;
+  const int sign = (V < 0);
+  if (sign) V = -V;
+  if (V > (int)mtx->zthresh_[0]) {
+    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+    const int err = (V - qV);
+    *v = sign ? -qV : qV;
+    return (sign ? -err : err) >> DSCALE;
+  }
+  *v = 0;
+  return (sign ? -V : V) >> DSCALE;
+}
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+                            const VP8Matrix* const mtx,
+                            int16_t tmp[][16], VP8ModeScore* const rd) {
+  //         | top[0] | top[1]
+  // --------+--------+---------
+  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
+  // left[1] | tmp[2]   tmp[3]        err2 err3
+  //
+  // Final errors {err1,err2,err3} are preserved and later restored
+  // as top[]/left[] on the next block.
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    const int8_t* const top = it->top_derr_[it->x_][ch];
+    const int8_t* const left = it->left_derr_[ch];
+    int16_t (* const c)[16] = &tmp[ch * 4];
+    int err0, err1, err2, err3;
+    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+    err0 = QuantizeSingle(&c[0][0], mtx);
+    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+    err1 = QuantizeSingle(&c[1][0], mtx);
+    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+    err2 = QuantizeSingle(&c[2][0], mtx);
+    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+    err3 = QuantizeSingle(&c[3][0], mtx);
+    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+    rd->derr[ch][0] = (int8_t)err1;
+    rd->derr[ch][1] = (int8_t)err2;
+    rd->derr[ch][2] = (int8_t)err3;
+  }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+                                 const VP8ModeScore* const rd) {
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    int8_t* const top = it->top_derr_[it->x_][ch];
+    int8_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];            // restore err1
+    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
+    top[0]  = rd->derr[ch][1];            //     ... err2
+    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
+  }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                          uint8_t* const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
@@ -839,6 +919,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
   for (n = 0; n < 8; n += 2) {
     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
   }
+  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
   if (DO_TRELLIS_UV && it->do_trellis_) {
     int ch, x, y;
     for (ch = 0, n = 0; ch <= 2; ch += 2) {
@@ -896,19 +978,6 @@ static void SwapOut(VP8EncIterator* const it) {
   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }
 
-static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
-  score_t score = 0;
-  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
-    int i;
-    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
-      score += (levels[i] != 0);
-      if (score > thresh) return 0;
-    }
-    levels += 16;
-  }
-  return 1;
-}
-
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
   const int kNumBlocks = 16;
   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
@@ -919,6 +988,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
   VP8ModeScore* rd_cur = &rd_tmp;
   VP8ModeScore* rd_best = rd;
   int mode;
+  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
 
   rd->mode_i16 = -1;
   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
@@ -934,10 +1004,14 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
         tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
     rd_cur->H = VP8FixedCostsI16[mode];
     rd_cur->R = VP8GetCostLuma16(it, rd_cur);
-    if (mode > 0 &&
-        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
-      // penalty to avoid flat area to be mispredicted by complex mode
-      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
+    if (is_flat) {
+      // refine the first impression (which was in pixel space)
+      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
+      if (is_flat) {
+        // Block is very flat. We put emphasis on the distortion being very low!
+        rd_cur->D *= 2;
+        rd_cur->SD *= 2;
+      }
     }
 
     // Since we always examine Intra16 first, we can overwrite *rd directly.
@@ -1018,7 +1092,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
                   : 0;
       rd_tmp.H = mode_costs[mode];
 
-      // Add flatness penalty
+      // Add flatness penalty, to avoid flat area to be mispredicted
+      // by a complex mode.
       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
         rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
       } else {
@@ -1101,6 +1176,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
       CopyScore(&rd_best, &rd_uv);
       rd->mode_uv = mode;
       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+      if (it->top_derr_ != NULL) {
+        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+      }
       SwapPtr(&dst, &tmp_dst);
     }
   }
@@ -1109,6 +1187,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   if (dst != dst0) {   // copy 16x8 block if needed
     VP8Copy16x8(dst, dst0);
   }
+  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
+    StoreDiffusionErrors(it, rd);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -1162,16 +1243,24 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
+      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                           + VP8FixedCostsI16[mode] * lambda_d_i16;
       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
         continue;
       }
+
       if (score < best_score) {
         best_mode = mode;
         best_score = score;
       }
     }
+    if (it->x_ == 0 || it->y_ == 0) {
+      // avoid starting a checkerboard resonance from the border. See bug #432.
+      if (IsFlatSource16(src)) {
+        best_mode = (it->x_ == 0) ? 0 : 2;
+        try_both_modes = 0;  // stick to i16
+      }
+    }
     VP8SetIntra16Mode(it, best_mode);
     // we'll reconstruct later, if i16 mode actually gets selected
   }