diff options
Diffstat (limited to 'chromium/third_party/libwebp/enc/quant.c')
-rw-r--r-- | chromium/third_party/libwebp/enc/quant.c | 272 |
1 files changed, 188 insertions, 84 deletions
diff --git a/chromium/third_party/libwebp/enc/quant.c b/chromium/third_party/libwebp/enc/quant.c index 462d4e9e6ef..e1d202b5a3b 100644 --- a/chromium/third_party/libwebp/enc/quant.c +++ b/chromium/third_party/libwebp/enc/quant.c @@ -13,6 +13,7 @@ #include <assert.h> #include <math.h> +#include <stdlib.h> // for abs() #include "./vp8enci.h" #include "./cost.h" @@ -24,18 +25,78 @@ #define MID_ALPHA 64 // neutral value for susceptibility #define MIN_ALPHA 30 // lowest usable value for susceptibility -#define MAX_ALPHA 100 // higher meaninful value for susceptibility +#define MAX_ALPHA 100 // higher meaningful value for susceptibility #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP // power-law modulation. Must be strictly less than 1. #define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision +// number of non-zero coeffs below which we consider the block very flat +// (and apply a penalty to complex predictions) +#define FLATNESS_LIMIT_I16 10 // I16 mode +#define FLATNESS_LIMIT_I4 3 // I4 mode +#define FLATNESS_LIMIT_UV 2 // UV mode +#define FLATNESS_PENALTY 140 // roughly ~1bit per block + #define MULT_8B(a, b) (((a) * (b) + 128) >> 8) -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif +// #define DEBUG_BLOCK + +//------------------------------------------------------------------------------ + +#if defined(DEBUG_BLOCK) + +#include <stdio.h> +#include <stdlib.h> + +static void PrintBlockInfo(const VP8EncIterator* const it, + const VP8ModeScore* const rd) { + int i, j; + const int is_i16 = (it->mb_->type_ == 1); + printf("SOURCE / OUTPUT / ABS DELTA\n"); + for (j = 0; j < 24; ++j) { + if (j == 16) printf("\n"); // newline before the U/V block + for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]); + printf(" "); + for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]); + printf(" "); + for (i = 0; i < 16; ++i) { + printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS])); + } + printf("\n"); + } + printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n", + (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz, + (int)rd->score); + if (is_i16) { + printf("Mode: %d\n", rd->mode_i16); + printf("y_dc_levels:"); + for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]); + printf("\n"); + } else { + printf("Modes[16]: "); + for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]); + printf("\n"); + } + printf("y_ac_levels:\n"); + for (j = 0; j < 16; ++j) { + for (i = is_i16 ? 1 : 0; i < 16; ++i) { + printf("%4d ", rd->y_ac_levels[j][i]); + } + printf("\n"); + } + printf("\n"); + printf("uv_levels (mode=%d):\n", rd->mode_uv); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 16; ++i) { + printf("%4d ", rd->uv_levels[j][i]); + } + printf("\n"); + } +} + +#endif // DEBUG_BLOCK //------------------------------------------------------------------------------ @@ -104,31 +165,13 @@ static const uint16_t kAcTable2[128] = { 385, 393, 401, 409, 416, 424, 432, 440 }; -static const uint16_t kCoeffThresh[16] = { - 0, 10, 20, 30, - 10, 20, 30, 30, - 20, 30, 30, 30, - 30, 30, 30, 30 -}; - -// TODO(skal): tune more. Coeff thresholding? -static const uint8_t kBiasMatrices[3][16] = { // [3] = [luma-ac,luma-dc,chroma] - { 96, 96, 96, 96, - 96, 96, 96, 96, - 96, 96, 96, 96, - 96, 96, 96, 96 }, - { 96, 96, 96, 96, - 96, 96, 96, 96, - 96, 96, 96, 96, - 96, 96, 96, 96 }, - { 96, 96, 96, 96, - 96, 96, 96, 96, - 96, 96, 96, 96, - 96, 96, 96, 96 } +static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac] + { 96, 110 }, { 96, 108 }, { 110, 115 } }; -// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis). +// Sharpening by (slightly) raising the hi-frequency coeffs. // Hack-ish but helpful for mid-bitrate range. Use with care. +#define SHARPEN_BITS 11 // number of descaling bits for sharpening bias static const uint8_t kFreqSharpening[16] = { 0, 30, 60, 90, 30, 60, 90, 90, @@ -141,20 +184,30 @@ static const uint8_t kFreqSharpening[16] = { // Returns the average quantizer static int ExpandMatrix(VP8Matrix* const m, int type) { - int i; - int sum = 0; + int i, sum; + for (i = 0; i < 2; ++i) { + const int is_ac_coeff = (i > 0); + const int bias = kBiasMatrices[type][is_ac_coeff]; + m->iq_[i] = (1 << QFIX) / m->q_[i]; + m->bias_[i] = BIAS(bias); + // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is: + // * zero if coeff <= zthresh + // * non-zero if coeff > zthresh + m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i]; + } for (i = 2; i < 16; ++i) { m->q_[i] = m->q_[1]; + m->iq_[i] = m->iq_[1]; + m->bias_[i] = m->bias_[1]; + m->zthresh_[i] = m->zthresh_[1]; } - for (i = 0; i < 16; ++i) { - const int j = kZigzag[i]; - const int bias = kBiasMatrices[type][j]; - m->iq_[j] = (1 << QFIX) / m->q_[j]; - m->bias_[j] = BIAS(bias); - // TODO(skal): tune kCoeffThresh[] - m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8; - m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11; - sum += m->q_[j]; + for (sum = 0, i = 0; i < 16; ++i) { + if (type == 0) { // we only use sharpening for AC luma coeffs + m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS; + } else { + m->sharpen_[i] = 0; + } + sum += m->q_[i]; } return (sum + 8) >> 4; } @@ -182,17 +235,17 @@ static void SetupMatrices(VP8Encoder* enc) { q16 = ExpandMatrix(&m->y2_, 1); quv = ExpandMatrix(&m->uv_, 2); - // TODO: Switch to kLambda*[] tables? - { - m->lambda_i4_ = (3 * q4 * q4) >> 7; - m->lambda_i16_ = (3 * q16 * q16); - m->lambda_uv_ = (3 * quv * quv) >> 6; - m->lambda_mode_ = (1 * q4 * q4) >> 7; - m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3; - m->lambda_trellis_i16_ = (q16 * q16) >> 2; - m->lambda_trellis_uv_ = (quv *quv) << 1; - m->tlambda_ = (tlambda_scale * q4) >> 5; - } + m->lambda_i4_ = (3 * q4 * q4) >> 7; + m->lambda_i16_ = (3 * q16 * q16); + m->lambda_uv_ = (3 * quv * quv) >> 6; + m->lambda_mode_ = (1 * q4 * q4) >> 7; + m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3; + m->lambda_trellis_i16_ = (q16 * q16) >> 2; + m->lambda_trellis_uv_ = (quv *quv) << 1; + m->tlambda_ = (tlambda_scale * q4) >> 5; + + m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto + m->max_edge_ = 0; } } @@ -201,16 +254,21 @@ static void SetupMatrices(VP8Encoder* enc) { // Very small filter-strength values have close to no visual effect. So we can // save a little decoding-CPU by turning filtering off for these. -#define FSTRENGTH_CUTOFF 3 +#define FSTRENGTH_CUTOFF 2 static void SetupFilterStrength(VP8Encoder* const enc) { int i; - const int level0 = enc->config_->filter_strength; + // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering. + const int level0 = 5 * enc->config_->filter_strength; for (i = 0; i < NUM_MB_SEGMENTS; ++i) { - // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS) - const int level = level0 * 256 * enc->dqm_[i].quant_ / 128; - const int f = level / (256 + enc->dqm_[i].beta_); - enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f; + VP8SegmentInfo* const m = &enc->dqm_[i]; + // We focus on the quantization of AC coeffs. + const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2; + const int base_strength = + VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep); + // Segments with lower complexity ('beta') will be less filtered. + const int f = base_strength * level0 / (256 + m->beta_); + m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f; } // We record the initial strength (mainly for the case of 1-segment only). enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_; @@ -234,7 +292,7 @@ static double QualityToCompression(double c) { // exponent is somewhere between 2.8 and 3.2, but we're mostly interested // in the mid-quant range. So we scale the compressibility inversely to // this power-law: quant ~= compression ^ 1/3. This law holds well for - // low quant. Finer modelling for high-quant would make use of kAcTable[] + // low quant. Finer modeling for high-quant would make use of kAcTable[] // more explicitly. const double v = pow(linear_c, 1 / 3.); return v; @@ -367,16 +425,14 @@ const int VP8I4ModeOffsets[NUM_BMODES] = { }; void VP8MakeLuma16Preds(const VP8EncIterator* const it) { - const VP8Encoder* const enc = it->enc_; - const uint8_t* const left = it->x_ ? enc->y_left_ : NULL; - const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL; + const uint8_t* const left = it->x_ ? it->y_left_ : NULL; + const uint8_t* const top = it->y_ ? it->y_top_ : NULL; VP8EncPredLuma16(it->yuv_p_, left, top); } void VP8MakeChroma8Preds(const VP8EncIterator* const it) { - const VP8Encoder* const enc = it->enc_; - const uint8_t* const left = it->x_ ? enc->u_left_ : NULL; - const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL; + const uint8_t* const left = it->x_ ? it->u_left_ : NULL; + const uint8_t* const top = it->y_ ? it->uv_top_ : NULL; VP8EncPredChroma8(it->yuv_p_, left, top); } @@ -432,6 +488,7 @@ static void InitScore(VP8ModeScore* const rd) { rd->D = 0; rd->SD = 0; rd->R = 0; + rd->H = 0; rd->nz = 0; rd->score = MAX_COST; } @@ -440,6 +497,7 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) { dst->D = src->D; dst->SD = src->SD; dst->R = src->R; + dst->H = src->H; dst->nz = src->nz; // note that nz is not accumulated, but just copied. dst->score = src->score; } @@ -448,6 +506,7 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) { dst->D += src->D; dst->SD += src->SD; dst->R += src->R; + dst->H += src->H; dst->nz |= src->nz; // here, new nz bits are accumulated. dst->score += src->score; } @@ -476,7 +535,7 @@ typedef struct { static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) { // TODO: incorporate the "* 256" in the tables? - rd->score = rd->R * lambda + 256 * (rd->D + rd->SD); + rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD); } static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate, @@ -539,11 +598,10 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, // note: it's important to take sign of the _original_ coeff, // so we don't have to consider level < 0 afterward. const int sign = (in[j] < 0); - int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; - int level0; - if (coeff0 > 2047) coeff0 = 2047; + const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; + int level0 = QUANTDIV(coeff0, iQ, B); + if (level0 > MAX_LEVEL) level0 = MAX_LEVEL; - level0 = QUANTDIV(coeff0, iQ, B); // test all alternate level values around level0. for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { Node* const cur = &NODE(n, m); @@ -555,7 +613,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, cur->sign = sign; cur->level = level; cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2; - if (level >= 2048 || level < 0) { // node is dead? + if (level > MAX_LEVEL || level < 0) { // node is dead? cur->cost = MAX_COST; continue; } @@ -648,10 +706,10 @@ static int ReconstructIntra16(VP8EncIterator* const it, VP8ModeScore* const rd, uint8_t* const yuv_out, int mode) { - const VP8Encoder* const enc = it->enc_; + VP8Encoder* const enc = it->enc_; const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; const uint8_t* const src = it->yuv_in_ + Y_OFF; - const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; + VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; int nz = 0; int n; int16_t tmp[16][16], dc_tmp[16]; @@ -660,7 +718,7 @@ static int ReconstructIntra16(VP8EncIterator* const it, VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); } VP8FTransformWHT(tmp[0], dc_tmp); - nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24; + nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24; if (DO_TRELLIS_I16 && it->do_trellis_) { int x, y; @@ -755,7 +813,18 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, //------------------------------------------------------------------------------ // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost. -// Pick the mode is lower RD-cost = Rate + lamba * Distortion. +// Pick the mode is lower RD-cost = Rate + lambda * Distortion. + +static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) { + // We look at the first three AC coefficients to determine what is the average + // delta between each sub-4x4 block. + const int v0 = abs(DCs[1]); + const int v1 = abs(DCs[4]); + const int v2 = abs(DCs[5]); + int max_v = (v0 > v1) ? v1 : v0; + max_v = (v2 > max_v) ? v2 : max_v; + if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v; +} static void SwapPtr(uint8_t** a, uint8_t** b) { uint8_t* const tmp = *a; @@ -767,9 +836,23 @@ static void SwapOut(VP8EncIterator* const it) { SwapPtr(&it->yuv_out_, &it->yuv_out2_); } +static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) { + score_t score = 0; + while (num_blocks-- > 0) { // TODO(skal): refine positional scoring? + int i; + for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC + score += (levels[i] != 0); + if (score > thresh) return 0; + } + levels += 16; + } + return 1; +} + static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) { - const VP8Encoder* const enc = it->enc_; - const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; + const int kNumBlocks = 16; + VP8Encoder* const enc = it->enc_; + VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; const int lambda = dqm->lambda_i16_; const int tlambda = dqm->tlambda_; const uint8_t* const src = it->yuv_in_ + Y_OFF; @@ -788,8 +871,13 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) { rd16.D = VP8SSE16x16(src, tmp_dst); rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0; + rd16.H = VP8FixedCostsI16[mode]; rd16.R = VP8GetCostLuma16(it, &rd16); - rd16.R += VP8FixedCostsI16[mode]; + if (mode > 0 && + IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) { + // penalty to avoid flat area to be mispredicted by complex mode + rd16.R += FLATNESS_PENALTY * kNumBlocks; + } // Since we always examine Intra16 first, we can overwrite *rd directly. SetRDScore(lambda, &rd16); @@ -804,6 +892,13 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) { } SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision. VP8SetIntra16Mode(it, rd->mode_i16); + + // we have a blocky macroblock (only DCs are non-zero) with fairly high + // distortion, record max delta so we can later adjust the minimal filtering + // strength needed to smooth these blocks out. + if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) { + StoreMaxDelta(dqm, rd->y_dc_levels); + } } //------------------------------------------------------------------------------ @@ -833,9 +928,11 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { } InitScore(&rd_best); - rd_best.score = 211; // '211' is the value of VP8BitCost(0, 145) + rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145) + SetRDScore(dqm->lambda_mode_, &rd_best); VP8IteratorStartI4(it); do { + const int kNumBlocks = 1; VP8ModeScore rd_i4; int mode; int best_mode = -1; @@ -859,8 +956,11 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { rd_tmp.SD = tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY)) : 0; + rd_tmp.H = mode_costs[mode]; rd_tmp.R = VP8GetCostLuma4(it, tmp_levels); - rd_tmp.R += mode_costs[mode]; + if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) { + rd_tmp.R += FLATNESS_PENALTY * kNumBlocks; + } SetRDScore(lambda, &rd_tmp); if (best_mode < 0 || rd_tmp.score < rd_i4.score) { @@ -872,14 +972,17 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { } SetRDScore(dqm->lambda_mode_, &rd_i4); AddScore(&rd_best, &rd_i4); - total_header_bits += mode_costs[best_mode]; - if (rd_best.score >= rd->score || - total_header_bits > enc->max_i4_header_bits_) { + if (rd_best.score >= rd->score) { + return 0; + } + total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode]; + if (total_header_bits > enc->max_i4_header_bits_) { return 0; } // Copy selected samples if not in the right place already. - if (best_block != best_blocks + VP8Scan[it->i4_]) + if (best_block != best_blocks + VP8Scan[it->i4_]) { VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]); + } rd->modes_i4[it->i4_] = best_mode; it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0); } while (VP8IteratorRotateI4(it, best_blocks)); @@ -895,6 +998,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { //------------------------------------------------------------------------------ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { + const int kNumBlocks = 8; const VP8Encoder* const enc = it->enc_; const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; const int lambda = dqm->lambda_uv_; @@ -915,8 +1019,11 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { // Compute RD-score rd_uv.D = VP8SSE16x8(src, tmp_dst); rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas. + rd_uv.H = VP8FixedCostsUV[mode]; rd_uv.R = VP8GetCostUV(it, &rd_uv); - rd_uv.R += VP8FixedCostsUV[mode]; + if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) { + rd_uv.R += FLATNESS_PENALTY * kNumBlocks; + } SetRDScore(lambda, &rd_uv); if (mode == 0 || rd_uv.score < rd_best.score) { @@ -1047,6 +1154,3 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, return is_skipped; } -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif |