Update Chromium to beta version 37.0.2062.68

Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
author: Jocelyn Turcotte <jocelyn.turcotte@digia.com> 2014-08-08 14:30:41 +0200
committer: Jocelyn Turcotte <jocelyn.turcotte@digia.com> 2014-08-12 13:49:54 +0200
commit: ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree: 498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/libvpx/source/libvpx/vp9/encoder
parent: 4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)
95 files changed, 24196 insertions, 17381 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
new file mode 100644
index 00000000000..47ad8d8cc42
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_segmentation.h"
+
+static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
+  {1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+
+  // Make SURE use of floating point in this function is safe.
+  vp9_clear_system_state();
+
+  if (cm->frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    int segment;
+
+    // Clear down the segment map.
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+    // Clear down the complexity map used for rd.
+    vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+
+    vp9_enable_segmentation(seg);
+    vp9_clearall_segfeatures(seg);
+
+    // Select delta coding method.
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+
+    // Use some of the segments for in frame Q adjustment.
+    for (segment = 1; segment < 2; segment++) {
+      const int qindex_delta =
+          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                     in_frame_q_adj_ratio[segment]);
+      vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+      vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+    }
+  }
+}
+
+// Select a segment (and rd complexity value) for the SB64 at mi_row, mi_col.
+void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
+                                   int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  int complexity_metric = 64;
+  int x, y;
+
+  unsigned char segment;
+
+  if (!output_enabled) {
+    segment = 0;
+  } else {
+    // Rate depends on fraction of a SB64 in frame: (xmis * ymis) / (bw * bh).
+    // It is converted to bits * 256 units; 64-bit math avoids int overflow.
+    const int target_rate = (int)(((int64_t)cpi->rc.sb64_target_rate * xmis *
+                                   ymis * 256) / (bw * bh));
+
+    if (projected_rate < (target_rate / 4)) {
+      segment = 1;
+    } else {
+      segment = 0;
+    }
+
+    if (target_rate > 0) {
+      const int64_t ratio = ((int64_t)projected_rate * 64) / target_rate;
+      complexity_metric = (int)(ratio < 16 ? 16 : (ratio > 255 ? 255 : ratio));
+    }
+  }
+
+  // Fill in the entries in the segment map corresponding to this SB64.
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+      cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
+        (unsigned char)complexity_metric;
+    }
+  }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
new file mode 100644
index 00000000000..af031a46c6c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+// Select a segment for the current SB64.
+void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate);
+
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
new file mode 100644
index 00000000000..d1437d3770f
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -0,0 +1,324 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+struct CYCLIC_REFRESH {
+  // Percentage of super-blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int max_sbs_perframe;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Block size below which we don't apply cyclic refresh.
+  BLOCK_SIZE min_block_size;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long a block will need to wait to be refreshed again.
+  int time_for_refresh;
+  // Actual number of (8x8) blocks that were applied delta-q (segment 1).
+  int num_seg_blocks;
+  // Actual encoding bits for segment 1.
+  int actual_seg_bits;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Cyclic refresh map.
+  signed char *map;
+  // Projected rate and distortion for the current superblock.
+  int64_t projected_rate_sb;
+  int64_t projected_dist_sb;
+  // Thresholds applied to projected rate/distortion of the superblock.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+};
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
+  if (cr == NULL)
+    return NULL;
+
+  cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+  if (cr->map == NULL) {
+    vpx_free(cr);
+    return NULL;
+  }
+
+  return cr;
+}
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  if (cr != NULL) vpx_free(cr->map);  // Tolerate a NULL handle.
+  vpx_free(cr);
+}
+
+// Check if we should turn off cyclic refresh based on bitrate condition.
+static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
+                                        const RATE_CONTROL *rc) {
+  // Turn off cyclic refresh if bits available per frame is not sufficiently
+  // larger than bit cost of segmentation. Segment map bit cost should scale
+  // with number of seg blocks, so compare available bits to number of blocks.
+  // Average bits available per frame = avg_frame_bandwidth
+  // Number of (8x8) blocks in frame = mi_rows * mi_cols;
+  const float factor  = 0.5;
+  const int number_blocks = cm->mi_rows  * cm->mi_cols;
+  // The condition below corresponds to turning off at target bitrates:
+  // ~24kbps for CIF, 72kbps for VGA (at 30fps).
+  // Also turn off at very small frame sizes, to avoid too large fraction of
+  // superblocks to be refreshed per frame. Threshold below is less than QCIF.
+  if (rc->avg_frame_bandwidth < factor * number_blocks ||
+      number_blocks / 64 < 5)
+    return 0;
+  else
+    return 1;
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). Decision can be based on various factors, such as
+// size of the coding block (i.e., below min_block size rejected), coding
+// mode, and rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi,
+                                BLOCK_SIZE bsize, int use_rd) {
+  if (use_rd) {
+    // If projected rate is below the thresh_rate (well below target,
+    // so undershoot expected), accept it for lower-qp coding.
+    if (cr->projected_rate_sb < cr->thresh_rate_sb)
+      return 1;
+    // Otherwise, reject the block for lower-qp coding if any of the following:
+    // 1) prediction block size is below min_block_size
+    // 2) mode is non-zero mv and projected distortion is above thresh_dist
+    // 3) mode is an intra-mode (we may want to allow some of this under
+    // another thresh_dist)
+    else if (bsize < cr->min_block_size ||
+             (mbmi->mv[0].as_int != 0 &&
+              cr->projected_dist_sb > cr->thresh_dist_sb) ||
+             !is_inter_block(mbmi))
+      return 0;
+    else
+      return 1;
+  } else {
+    // Rate/distortion not used for update.
+    if (bsize < cr->min_block_size ||
+        mbmi->mv[0].as_int != 0 ||
+        !is_inter_block(mbmi))
+      return 0;
+    else
+      return 1;
+  }
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
+                                       MB_MODE_INFO *const mbmi,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, int use_rd) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  const int refresh_this_block = cpi->mb.in_static_area ||
+                                 candidate_refresh_aq(cr, mbmi, bsize, use_rd);
+  // Default is to not update the refresh map.
+  int new_map_value = cr->map[block_index];
+  int x = 0; int y = 0;
+
+  // Check if we should reset the segment_id for this block.
+  if (mbmi->segment_id > 0 && !refresh_this_block)
+    mbmi->segment_id = 0;
+
+  // Update the cyclic refresh map, to be used for setting segmentation map
+  // for the next frame. If the block will be refreshed this frame, mark it
+  // as clean. The magnitude of the negative value influences how long before
+  // we consider it for refresh again.
+  if (mbmi->segment_id == 1) {
+    new_map_value = -cr->time_for_refresh;
+  } else if (refresh_this_block) {
+    // Else if it is accepted as candidate for refresh, and has not already
+    // been refreshed (marked as 1) then mark it as a candidate for cleanup
+    // for future time (marked as 0), otherwise don't update it.
+    if (cr->map[block_index] == 1)
+      new_map_value = 0;
+  } else {
+    // Leave it marked as block that is not candidate for refresh.
+    new_map_value = 1;
+  }
+  // Update entries in the cyclic refresh map with new_map_value, and
+  // copy mbmi->segment_id into global segmentation map.
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      cr->map[block_index + y * cm->mi_cols + x] = new_map_value;
+      cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
+          mbmi->segment_id;
+    }
+  // Keep track of actual number (in units of 8x8) of blocks in segment 1 used
+  // for encoding this frame.
+  if (mbmi->segment_id)
+    cr->num_seg_blocks += xmis * ymis;
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  struct segmentation *const seg = &cm->seg;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
+  // Don't apply refresh on key frame or enhancement layer frames.
+  if (!apply_cyclic_refresh ||
+      (cm->frame_type == KEY_FRAME) ||
+      (cpi->svc.temporal_layer_id > 0)) {
+    // Set segmentation map to 0 and disable.
+    vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+    vp9_disable_segmentation(&cm->seg);
+    if (cm->frame_type == KEY_FRAME)
+      cr->sb_index = 0;
+    return;
+  } else {
+    int qindex_delta = 0;
+    int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+    int xmis, ymis, x, y, qindex2;
+
+    // Rate target ratio to set q delta.
+    const float rate_ratio_qdelta = 2.0;
+    const double q = vp9_convert_qindex_to_q(cm->base_qindex);
+    vp9_clear_system_state();
+    // Some of these parameters may be set via codec-control function later.
+    cr->max_sbs_perframe = 10;
+    cr->max_qdelta_perc = 50;
+    cr->min_block_size = BLOCK_8X8;
+    cr->time_for_refresh = 1;
+    // Set rate threshold to some fraction of target (and scaled by 256).
+    cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
+    // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+    cr->thresh_dist_sb = 8 * (int)(q * q);
+    if (cpi->sf.use_nonrd_pick_mode) {
+      // May want to be more conservative with thresholds in non-rd mode for now
+      // as rate/distortion are derived from model based on prediction residual.
+      cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3;
+      cr->thresh_dist_sb = 4 * (int)(q * q);
+    }
+
+    cr->num_seg_blocks = 0;
+    // Set up segmentation.
+    // Clear down the segment map.
+    vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+    vp9_enable_segmentation(&cm->seg);
+    vp9_clearall_segfeatures(seg);
+    // Select delta coding method.
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    // Note: setting temporal_update has no effect, as the seg-map coding method
+    // (temporal or spatial) is determined in vp9_choose_segmap_coding_method(),
+    // based on the coding cost of each method. When error_resilient mode is
+    // on, the last_frame_seg_map is set to 0, so if temporal coding is used,
+    // it is relative to an all-zero previous map.
+    // seg->temporal_update = 0;
+
+    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+    // Use segment 1 for in-frame Q adjustment.
+    vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+    // Set the q delta for segment 1.
+    qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
+                                              cm->base_qindex,
+                                              rate_ratio_qdelta);
+    // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
+    // previous encoded frame.
+    if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
+      qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100;
+
+    // Compute rd-mult for segment 1.
+    qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+    cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
+
+    vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
+
+    sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    sbs_in_frame = sb_cols * sb_rows;
+    // Number of target superblocks to get the q delta (segment 1).
+    block_count = cr->max_sbs_perframe * sbs_in_frame / 100;
+    // Set the segmentation map: cycle through the superblocks, starting at
+    // cr->sb_index, and stopping when either block_count blocks have been found
+    // to be refreshed, or we have passed through whole frame.
+    assert(cr->sb_index < sbs_in_frame);
+    i = cr->sb_index;
+    do {
+      int sum_map = 0;
+      // Get the mi_row/mi_col corresponding to superblock index i.
+      int sb_row_index = (i / sb_cols);
+      int sb_col_index = i - sb_row_index * sb_cols;
+      int mi_row = sb_row_index * MI_BLOCK_SIZE;
+      int mi_col = sb_col_index * MI_BLOCK_SIZE;
+      assert(mi_row >= 0 && mi_row < cm->mi_rows);
+      assert(mi_col >= 0 && mi_col < cm->mi_cols);
+      bl_index = mi_row * cm->mi_cols + mi_col;
+      // Loop through all 8x8 blocks in superblock and update map.
+      xmis = MIN(cm->mi_cols - mi_col,
+                 num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+      ymis = MIN(cm->mi_rows - mi_row,
+                 num_8x8_blocks_high_lookup[BLOCK_64X64]);
+      for (y = 0; y < ymis; y++) {
+        for (x = 0; x < xmis; x++) {
+          const int bl_index2 = bl_index + y * cm->mi_cols + x;
+          // If the block is a candidate for clean up then mark it
+          // for possible boost/refresh (segment 1). The segment id may get
+          // reset to 0 later if block gets coded anything other than ZEROMV.
+          if (cr->map[bl_index2] == 0) {
+            seg_map[bl_index2] = 1;
+            sum_map++;
+          } else if (cr->map[bl_index2] < 0) {
+            cr->map[bl_index2]++;
+          }
+        }
+      }
+      // Enforce constant segment over superblock.
+      // If segment is partial over superblock, reset to either all 1 or 0.
+      if (sum_map > 0 && sum_map < xmis * ymis) {
+        const int new_value = (sum_map >= xmis * ymis / 2);
+        for (y = 0; y < ymis; y++)
+          for (x = 0; x < xmis; x++)
+            seg_map[bl_index + y * cm->mi_cols + x] = new_value;
+      }
+      i++;
+      if (i == sbs_in_frame) {
+        i = 0;
+      }
+      if (sum_map >= xmis * ymis /2)
+        block_count--;
+    } while (block_count && i != cr->sb_index);
+    cr->sb_index = i;
+  }
+}
+
+void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
+                                             int64_t rate_sb, int64_t dist_sb) {
+  cr->projected_rate_sb = rate_sb;
+  cr->projected_dist_sb = dist_sb;
+}
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  return cr->rdmult;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
new file mode 100644
index 00000000000..f556d658bdc
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+struct CYCLIC_REFRESH;
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
+                                       MB_MODE_INFO *const mbmi,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, int use_rd);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
+
+void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
+                                             int64_t rate_sb, int64_t dist_sb);
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_vaq.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
index 3179ae301be..ae2a163b126 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_vaq.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -10,7 +10,7 @@
 
 #include <math.h>
 
-#include "vp9/encoder/vp9_vaq.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
@@ -19,8 +19,8 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#define ENERGY_MIN (-3)
-#define ENERGY_MAX (3)
+#define ENERGY_MIN (-1)
+#define ENERGY_MAX (1)
 #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN +  1)
 #define ENERGY_IN_BOUNDS(energy)\
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
@@ -44,7 +44,7 @@ unsigned int vp9_vaq_segment_id(int energy) {
 double vp9_vaq_rdmult_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   return RDMULT_RATIO(energy);
 }
@@ -52,7 +52,7 @@ double vp9_vaq_rdmult_ratio(int energy) {
 double vp9_vaq_inv_q_ratio(int energy) {
   ENERGY_IN_BOUNDS(energy);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   return Q_RATIO(-energy);
 }
@@ -63,9 +63,9 @@ void vp9_vaq_init() {
 
   assert(ENERGY_SPAN <= MAX_SEGMENTS);
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
-  base_ratio = 1.8;
+  base_ratio = 1.5;
 
   for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
     Q_RATIO(i) = pow(base_ratio, i/3.0);
@@ -75,35 +75,39 @@ void vp9_vaq_init() {
 void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   struct segmentation *seg = &cm->seg;
-  int base_q = vp9_convert_qindex_to_q(cm->base_qindex);
-  int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
-                                        cm->y_dc_delta_q);
+  const double base_q = vp9_convert_qindex_to_q(cm->base_qindex);
+  const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
+                                              cm->y_dc_delta_q);
   int i;
 
-  vp9_enable_segmentation((VP9_PTR)cpi);
-  vp9_clearall_segfeatures(seg);
+  if (cm->frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    vp9_enable_segmentation(seg);
+    vp9_clearall_segfeatures(seg);
 
-  seg->abs_delta = SEGMENT_DELTADATA;
+    seg->abs_delta = SEGMENT_DELTADATA;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
-  for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
-    int qindex_delta, segment_rdmult;
+    for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
+      int qindex_delta, segment_rdmult;
 
-    if (Q_RATIO(i) == 1) {
-      // No need to enable SEG_LVL_ALT_Q for this segment
-      RDMULT_RATIO(i) = 1;
-      continue;
-    }
+      if (Q_RATIO(i) == 1) {
+        // No need to enable SEG_LVL_ALT_Q for this segment
+        RDMULT_RATIO(i) = 1;
+        continue;
+      }
 
-    qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i));
-    vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
-    vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
+      qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i));
+      vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
+      vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
 
-    segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
-                                         cm->y_dc_delta_q);
+      segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
+                                           cm->y_dc_delta_q);
 
-    RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+      RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+    }
   }
 }
 
@@ -118,8 +122,8 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
       ((-xd->mb_to_bottom_edge) >> 3) : 0;
 
   if (right_overflow || bottom_overflow) {
-    int bw = (1 << (mi_width_log2(bs)  + 3)) - right_overflow;
-    int bh = (1 << (mi_height_log2(bs) + 3)) - bottom_overflow;
+    const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+    const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
     int avg;
     variance(x->plane[0].src.buf, x->plane[0].src.stride,
              vp9_64_zeros, 0, bw, bh, &sse, &avg);
@@ -137,11 +141,8 @@ int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   double energy;
   unsigned int var = block_variance(cpi, x, bs);
 
-  vp9_clear_system_state();  // __asm emms;
-
-  // if (var <= 1000)
-  //   return 0;
+  vp9_clear_system_state();
 
-  energy = 0.9*(logf(var + 1) - 10.0);
-  return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
+  energy = 0.9 * (log(var + 1.0) - 10.0);
+  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_vaq.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
index dc18b22f251..d1a459fe9ec 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_vaq.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -9,10 +9,14 @@
  */
 
 
-#ifndef VP9_ENCODER_VP9_CONFIG_VAQ_H_
-#define VP9_ENCODER_VP9_CONFIG_VAQ_H_
+#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VP9_ENCODER_VP9_AQ_VARIANCE_H_
 
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 unsigned int vp9_vaq_segment_id(int energy);
 double vp9_vaq_rdmult_ratio(int energy);
@@ -23,4 +27,8 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi);
 
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 
-#endif  // VP9_ENCODER_VP9_CONFIG_VAQ_H_
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
index a996e0e3bc4..8ef2b2eeda5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -14,286 +14,124 @@
 
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
-#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_treecoder.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_tile_common.h"
 
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
+static struct vp9_token intra_mode_encodings[INTRA_MODES];
+static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+static struct vp9_token partition_encodings[PARTITION_TYPES];
+static struct vp9_token inter_mode_encodings[INTER_MODES];
 
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-#endif
-
-#ifdef ENTROPY_STATS
-int intra_mode_stats[INTRA_MODES]
-                    [INTRA_MODES]
-                    [INTRA_MODES];
-vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
-
-extern unsigned int active_section;
-#endif
-
-
-#ifdef MODE_STATS
-int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
-int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
-void init_tx_count_stats() {
-  vp9_zero(tx_count_32x32p_stats);
-  vp9_zero(tx_count_16x16p_stats);
-  vp9_zero(tx_count_8x8p_stats);
-}
-
-void init_switchable_interp_stats() {
-  vp9_zero(switchable_interp_stats);
-}
-
-static void update_tx_count_stats(VP9_COMMON *cm) {
-  int i, j;
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES; j++) {
-      tx_count_32x32p_stats[i][j] += cm->fc.tx_count_32x32p[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES - 1; j++) {
-      tx_count_16x16p_stats[i][j] += cm->fc.tx_count_16x16p[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES - 2; j++) {
-      tx_count_8x8p_stats[i][j] += cm->fc.tx_count_8x8p[i][j];
-    }
-  }
-}
-
-static void update_switchable_interp_stats(VP9_COMMON *cm) {
-  int i, j;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    for (j = 0; j < SWITCHABLE_FILTERS; ++j)
-      switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
-}
-
-void write_tx_count_stats() {
-  int i, j;
-  FILE *fp = fopen("tx_count.bin", "wb");
-  fwrite(tx_count_32x32p_stats, sizeof(tx_count_32x32p_stats), 1, fp);
-  fwrite(tx_count_16x16p_stats, sizeof(tx_count_16x16p_stats), 1, fp);
-  fwrite(tx_count_8x8p_stats, sizeof(tx_count_8x8p_stats), 1, fp);
-  fclose(fp);
-
-  printf(
-      "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZES] = {\n");
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    printf("  { ");
-    for (j = 0; j < TX_SIZES; j++) {
-      printf("%"PRId64", ", tx_count_32x32p_stats[i][j]);
-    }
-    printf("},\n");
-  }
-  printf("};\n");
-  printf(
-      "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZES-1] = {\n");
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    printf("  { ");
-    for (j = 0; j < TX_SIZES - 1; j++) {
-      printf("%"PRId64", ", tx_count_16x16p_stats[i][j]);
-    }
-    printf("},\n");
-  }
-  printf("};\n");
-  printf(
-      "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZES-2] = {\n");
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    printf("  { ");
-    for (j = 0; j < TX_SIZES - 2; j++) {
-      printf("%"PRId64", ", tx_count_8x8p_stats[i][j]);
-    }
-    printf("},\n");
-  }
-  printf("};\n");
+void vp9_entropy_mode_init() {
+  vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
+  vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
+  vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
+  vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
 }
 
-void write_switchable_interp_stats() {
-  int i, j;
-  FILE *fp = fopen("switchable_interp.bin", "wb");
-  fwrite(switchable_interp_stats, sizeof(switchable_interp_stats), 1, fp);
-  fclose(fp);
-
-  printf(
-      "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
-      "[SWITCHABLE_FILTERS] = {\n");
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-    printf("  { ");
-    for (j = 0; j < SWITCHABLE_FILTERS; j++) {
-      printf("%"PRId64", ", switchable_interp_stats[i][j]);
-    }
-    printf("},\n");
-  }
-  printf("};\n");
+static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
+                             const vp9_prob *probs) {
+  vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
-#endif
 
-static INLINE void write_be32(uint8_t *p, int value) {
-  p[0] = value >> 24;
-  p[1] = value >> 16;
-  p[2] = value >> 8;
-  p[3] = value;
+static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode,
+                             const vp9_prob *probs) {
+  assert(is_inter_mode(mode));
+  vp9_write_token(w, vp9_inter_mode_tree, probs,
+                  &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
-                             int data, int max) {
+static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+                                int data, int max) {
   vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-static void update_mode(
-  vp9_writer *w,
-  int n,
-  vp9_tree tree,
-  vp9_prob Pnew[/* n-1 */],
-  vp9_prob Pcur[/* n-1 */],
-  unsigned int bct[/* n-1 */] [2],
-  const unsigned int num_events[/* n */]
-) {
-  int i = 0;
-
-  vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
-  n--;
-
-  for (i = 0; i < n; ++i)
-    vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
-}
+static void prob_diff_update(const vp9_tree_index *tree,
+                             vp9_prob probs[/*n - 1*/],
+                             const unsigned int counts[/*n - 1*/],
+                             int n, vp9_writer *w) {
+  int i;
+  unsigned int branch_ct[32][2];
 
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
-                                      vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  int j;
-  vp9_prob pnew[INTRA_MODES - 1];
-  unsigned int bct[INTRA_MODES - 1][2];
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
 
-  for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
-    update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
-                cm->fc.y_mode_prob[j], bct,
-                (unsigned int *)cpi->y_mode_count[j]);
+  vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+  for (i = 0; i < n - 1; ++i)
+    vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
 }
 
-static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
+static void write_selected_tx_size(const VP9_COMP *cpi,
                                    TX_SIZE tx_size, BLOCK_SIZE bsize,
                                    vp9_writer *w) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m);
+  const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+                                                 &cpi->common.fc.tx_probs);
   vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
-  if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) {
+  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
-    if (bsize >= BLOCK_32X32 && tx_size != TX_8X8)
+    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
       vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
   }
 }
 
-static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
-                            vp9_writer *w) {
+static int write_skip(const VP9_COMP *cpi, int segment_id, const MODE_INFO *mi,
+                      vp9_writer *w) {
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip_coeff = m->mbmi.skip_coeff;
-    vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd));
-    return skip_coeff;
+    const int skip = mi->mbmi.skip;
+    vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd));
+    return skip;
   }
 }
 
-void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
-  VP9_COMMON *cm = &cpi->common;
+static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
   int k;
 
-  for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-    vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
-}
-
-static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
+  for (k = 0; k < SKIP_CONTEXTS; ++k)
+    vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMP *const cpi,
-                                           vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  unsigned int branch_ct[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1][2];
-  vp9_prob new_prob[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1];
-  int i, j;
-  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
-    vp9_tree_probs_from_distribution(
-        vp9_switchable_interp_tree,
-        new_prob[j], branch_ct[j],
-        cm->counts.switchable_interp[j], 0);
-  }
-  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
-    for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
-      vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
-                                branch_ct[j][i]);
-    }
-  }
-#ifdef MODE_STATS
-  if (!cpi->dummy_packing)
-    update_switchable_interp_stats(cm);
-#endif
-}
-
-static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
-  int i, j;
-
-  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
-    unsigned int branch_ct[INTER_MODES - 1][2];
-    vp9_prob new_prob[INTER_MODES - 1];
-
-    vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
-                                     new_prob, branch_ct,
-                                     cm->counts.inter_mode[i], NEARESTMV);
-
-    for (j = 0; j < INTER_MODES - 1; ++j)
-      vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
-                                branch_ct[j]);
-  }
+static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
+  int j;
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+    prob_diff_update(vp9_switchable_interp_tree,
+                     cm->fc.switchable_interp_prob[j],
+                     cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_mb_tokens(vp9_writer* const bc,
-                           TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop) {
+static void pack_mb_tokens(vp9_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *stop) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
-    const struct vp9_token *const a = vp9_coef_encodings + t;
-    const vp9_extra_bit *const b = vp9_extra_bits + t;
+    const struct vp9_token *const a = &vp9_coef_encodings[t];
+    const vp9_extra_bit *const b = &vp9_extra_bits[t];
     int i = 0;
-    const vp9_prob *pp;
     int v = a->value;
     int n = a->len;
-    vp9_prob probs[ENTROPY_NODES];
-
-    if (t >= TWO_TOKEN) {
-      vp9_model_to_full_probs(p->context_tree, probs);
-      pp = probs;
-    } else {
-      pp = p->context_tree;
-    }
-    assert(pp != 0);
 
     /* skip one or two nodes */
     if (p->skip_eob_node) {
@@ -301,11 +139,24 @@ static void pack_mb_tokens(vp9_writer* const bc,
       i = 2 * p->skip_eob_node;
     }
 
-    do {
-      const int bb = (v >> --n) & 1;
-      vp9_write(bc, bb, pp[i >> 1]);
-      i = vp9_coef_tree[i + bb];
-    } while (n);
+    // TODO(jbb): expanding this can lead to big gains.  It allows
+    // much better branch prediction and would enable us to avoid numerous
+    // lookups and compares.
+
+    // If we have a token that's in the constrained set, the coefficient tree
+    // is split into two treed writes.  The first treed write takes care of the
+    // unconstrained nodes.  The second treed write takes care of the
+    // constrained nodes.
+    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
+      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
+      int bits = v >> (n - len);
+      vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i);
+      vp9_write_tree(w, vp9_coef_con_tree,
+                     vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
+                     v, n - len, 0);
+    } else {
+      vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i);
+    }
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
@@ -318,12 +169,12 @@ static void pack_mb_tokens(vp9_writer* const bc,
 
         do {
           const int bb = (v >> --n) & 1;
-          vp9_write(bc, bb, pb[i >> 1]);
+          vp9_write(w, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vp9_write_bit(bc, e & 1);
+      vp9_write_bit(w, e & 1);
     }
     ++p;
   }
@@ -331,457 +182,361 @@ static void pack_mb_tokens(vp9_writer* const bc,
   *tp = p + (p->token == EOSB_TOKEN);
 }
 
-static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
-                            const vp9_prob *p) {
-  assert(is_inter_mode(mode));
-  write_token(w, vp9_inter_mode_tree, p,
-              &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
-}
-
-
 static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
-    treed_write(w, vp9_segment_tree, seg->tree_probs, segment_id, 3);
+    vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
 }
 
 // This function encodes the reference frame
-static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi;
-  const int segment_id = mi->segment_id;
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  const int segment_id = mbmi->segment_id;
+
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (!seg_ref_active) {
+  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    assert(!is_compound);
+    assert(mbmi->ref_frame[0] ==
+               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+  } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
-                vp9_get_pred_prob_comp_inter_inter(cm, xd));
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
-      assert((mi->ref_frame[1] <= INTRA_FRAME) ==
-                 (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
-    if (mi->ref_frame[1] > INTRA_FRAME) {
-      vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
+    if (is_compound) {
+      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
-      vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
-                vp9_get_pred_prob_single_ref_p1(cm, xd));
-      if (mi->ref_frame[0] != LAST_FRAME)
-        vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
-                  vp9_get_pred_prob_single_ref_p2(cm, xd));
+      const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      if (bit0) {
+        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+      }
     }
-  } else {
-    assert(mi->ref_frame[1] <= INTRA_FRAME);
-    assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
-           mi->ref_frame[0]);
   }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment.
 }
 
-static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
+                                vp9_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc.nmvc;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct segmentation *seg = &cm->seg;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
-  const MB_PREDICTION_MODE mode = mi->mode;
-  const int segment_id = mi->segment_id;
-  int skip_coeff;
-  const BLOCK_SIZE bsize = mi->sb_type;
+  const MACROBLOCK *const x = &cpi->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct segmentation *const seg = &cm->seg;
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const PREDICTION_MODE mode = mbmi->mode;
+  const int segment_id = mbmi->segment_id;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-
-#ifdef ENTROPY_STATS
-  active_section = 9;
-#endif
+  const int is_inter = is_inter_block(mbmi);
+  const int is_compound = has_second_ref(mbmi);
+  int skip, ref;
 
   if (seg->update_map) {
     if (seg->temporal_update) {
-      const int pred_flag = mi->seg_id_predicted;
+      const int pred_flag = mbmi->seg_id_predicted;
       vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-      vp9_write(bc, pred_flag, pred_prob);
+      vp9_write(w, pred_flag, pred_prob);
       if (!pred_flag)
-        write_segment_id(bc, seg, segment_id);
+        write_segment_id(w, seg, segment_id);
     } else {
-      write_segment_id(bc, seg, segment_id);
+      write_segment_id(w, seg, segment_id);
     }
   }
 
-  skip_coeff = write_skip_coeff(cpi, segment_id, m, bc);
+  skip = write_skip(cpi, segment_id, mi, w);
 
   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(bc, rf != INTRA_FRAME,
-              vp9_get_pred_prob_intra_inter(cm, xd));
+    vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
-      !(rf != INTRA_FRAME &&
-        (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
-    write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc);
+      !(is_inter &&
+        (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
+    write_selected_tx_size(cpi, mbmi->tx_size, bsize, w);
   }
 
-  if (rf == INTRA_FRAME) {
-#ifdef ENTROPY_STATS
-    active_section = 6;
-#endif
-
+  if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
-      write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+      write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
     } else {
       int idx, idy;
-      const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-      const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-      for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
-        for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
-          const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
-          write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]);
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
+          const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+          write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
         }
       }
     }
-    write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
+    write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
   } else {
-    vp9_prob *mv_ref_p;
-    encode_ref_frame(cpi, bc);
-    mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]];
-
-#ifdef ENTROPY_STATS
-    active_section = 3;
-#endif
+    const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+    const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
+    write_ref_frames(cpi, w);
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
-        write_sb_mv_ref(bc, mode, mv_ref_p);
-        ++cm->counts.inter_mode[mi->mode_context[rf]]
-                               [inter_mode_offset(mode)];
+        write_inter_mode(w, mode, inter_probs);
+        ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)];
       }
     }
 
-    if (cm->mcomp_filter_type == SWITCHABLE) {
+    if (cm->interp_filter == SWITCHABLE) {
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
-      write_token(bc, vp9_switchable_interp_tree,
-                  cm->fc.switchable_interp_prob[ctx],
-                  &vp9_switchable_interp_encodings[mi->interp_filter]);
+      vp9_write_token(w, vp9_switchable_interp_tree,
+                      cm->fc.switchable_interp_prob[ctx],
+                      &switchable_interp_encodings[mbmi->interp_filter]);
     } else {
-      assert(mi->interp_filter == cm->mcomp_filter_type);
+      assert(mbmi->interp_filter == cm->interp_filter);
     }
 
     if (bsize < BLOCK_8X8) {
-      const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-      const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
       int idx, idy;
-      for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
-        for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      for (idy = 0; idy < 2; idy += num_4x4_h) {
+        for (idx = 0; idx < 2; idx += num_4x4_w) {
           const int j = idy * 2 + idx;
-          const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
-          write_sb_mv_ref(bc, blockmode, mv_ref_p);
-          ++cm->counts.inter_mode[mi->mode_context[rf]]
-                                 [inter_mode_offset(blockmode)];
-
-          if (blockmode == NEWMV) {
-#ifdef ENTROPY_STATS
-            active_section = 11;
-#endif
-            vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
-                          &mi->best_mv[0].as_mv, nmvc, allow_hp);
-
-            if (has_second_ref(mi))
-              vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
-                            &mi->best_mv[1].as_mv, nmvc, allow_hp);
+          const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+          write_inter_mode(w, b_mode, inter_probs);
+          ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+          if (b_mode == NEWMV) {
+            for (ref = 0; ref < 1 + is_compound; ++ref)
+              vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+                            &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+                            nmvc, allow_hp);
           }
         }
       }
-    } else if (mode == NEWMV) {
-#ifdef ENTROPY_STATS
-      active_section = 5;
-#endif
-      vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
-                    &mi->best_mv[0].as_mv, nmvc, allow_hp);
-
-      if (has_second_ref(mi))
-        vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
-                      &mi->best_mv[1].as_mv, nmvc, allow_hp);
+    } else {
+      if (mode == NEWMV) {
+        for (ref = 0; ref < 1 + is_compound; ++ref)
+          vp9_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
+                        &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
+                        allow_hp);
+      }
     }
   }
 }
 
 static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
-                              vp9_writer *bc) {
+                              vp9_writer *w) {
   const VP9_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   const struct segmentation *const seg = &cm->seg;
-  MODE_INFO *m = mi_8x8[0];
-  const int ym = m->mbmi.mode;
-  const int segment_id = m->mbmi.segment_id;
-  MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride];
-  MODE_INFO *left_mi = xd->left_available ? mi_8x8[-1] : NULL;
+  const MODE_INFO *const mi = mi_8x8[0];
+  const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride];
+  const MODE_INFO *const left_mi = xd->left_available ? mi_8x8[-1] : NULL;
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
 
   if (seg->update_map)
-    write_segment_id(bc, seg, m->mbmi.segment_id);
+    write_segment_id(w, seg, mbmi->segment_id);
 
-  write_skip_coeff(cpi, segment_id, m, bc);
+  write_skip(cpi, mbmi->segment_id, mi, w);
 
-  if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
-    write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc);
+  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
+    write_selected_tx_size(cpi, mbmi->tx_size, bsize, w);
 
-  if (m->mbmi.sb_type >= BLOCK_8X8) {
-    const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
-    const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0);
-    write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]);
+  if (bsize >= BLOCK_8X8) {
+    write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
   } else {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
     int idx, idy;
-    const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type];
-    const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type];
-    for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
-      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
-        int i = idy * 2 + idx;
-        const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i);
-        const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i);
-        const int bm = m->bmi[i].as_mode;
-#ifdef ENTROPY_STATS
-        ++intra_mode_stats[A][L][bm];
-#endif
-        write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]);
+
+    for (idy = 0; idy < 2; idy += num_4x4_h) {
+      for (idx = 0; idx < 2; idx += num_4x4_w) {
+        const int block = idy * 2 + idx;
+        write_intra_mode(w, mi->bmi[block].as_mode,
+                         get_y_mode_probs(mi, above_mi, left_mi, block));
       }
     }
   }
 
-  write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]);
+  write_intra_mode(w, mbmi->uv_mode, vp9_kf_uv_mode_prob[mbmi->mode]);
 }
 
 static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
-                          MODE_INFO **mi_8x8, vp9_writer *bc,
-                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                          int mi_row, int mi_col, int index) {
+                          vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                          int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MODE_INFO *m = mi_8x8[0];
-
-  if (m->mbmi.sb_type < BLOCK_8X8)
-    if (index > 0)
-      return;
+  MODE_INFO *m;
 
-  xd->mi_8x8 = mi_8x8;
+  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  m = xd->mi[0];
 
   set_mi_row_col(xd, tile,
                  mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
                  mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
                  cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cpi, mi_8x8, bc);
-#ifdef ENTROPY_STATS
-    active_section = 8;
-#endif
+    write_mb_modes_kf(cpi, xd->mi, w);
   } else {
-    pack_inter_mode_mvs(cpi, m, bc);
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
+    pack_inter_mode_mvs(cpi, m, w);
   }
 
   assert(*tok < tok_end);
-  pack_mb_tokens(bc, tok, tok_end);
+  pack_mb_tokens(w, tok, tok_end);
 }
 
-static void write_partition(PARTITION_TYPE partition,
-                            int hbs, int mi_rows, int mi_cols,
-                            int mi_row, int mi_col,
-                            vp9_prob probs[PARTITION_TYPES - 1],
-                            vp9_writer *w) {
-  const int has_rows = (mi_row + hbs) < mi_rows;
-  const int has_cols = (mi_col + hbs) < mi_cols;
+static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
+                            int hbs, int mi_row, int mi_col,
+                            PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const vp9_prob *const probs = get_partition_probs(cm, ctx);
+  const int has_rows = (mi_row + hbs) < cm->mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (has_rows && has_cols) {
-    write_token(w, vp9_partition_tree, probs,
-                &vp9_partition_encodings[partition]);
+    vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
   } else if (!has_rows && has_cols) {
-    assert(partition == PARTITION_SPLIT || partition == PARTITION_HORZ);
-    vp9_write(w, partition == PARTITION_SPLIT, probs[1]);
+    assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+    vp9_write(w, p == PARTITION_SPLIT, probs[1]);
   } else if (has_rows && !has_cols) {
-    assert(partition == PARTITION_SPLIT || partition == PARTITION_VERT);
-    vp9_write(w, partition == PARTITION_SPLIT, probs[2]);
+    assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+    vp9_write(w, p == PARTITION_SPLIT, probs[2]);
   } else {
-    assert(partition == PARTITION_SPLIT);
+    assert(p == PARTITION_SPLIT);
   }
 }
 
-static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
-                           MODE_INFO **mi_8x8, vp9_writer *bc,
-                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                           int mi_row, int mi_col, BLOCK_SIZE bsize,
-                           int index) {
+static void write_modes_sb(VP9_COMP *cpi,
+                           const TileInfo *const tile,
+                           vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
-  int bsl = b_width_log2(bsize);
-  int bs = (1 << bsl) / 4;  // mode_info step for subsize
-  int n;
-  PARTITION_TYPE partition = PARTITION_NONE;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  const int bsl = b_width_log2(bsize);
+  const int bs = (1 << bsl) / 4;
+  PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  MODE_INFO *m = mi_8x8[0];
+  MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   partition = partition_lookup[bsl][m->mbmi.sb_type];
-
-  if (bsize < BLOCK_8X8) {
-    if (index > 0)
-      return;
-  } else {
-    const int ctx = partition_plane_context(cpi->above_seg_context,
-                                            cpi->left_seg_context,
-                                            mi_row, mi_col, bsize);
-    write_partition(partition, bs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
-                    cm->fc.partition_prob[cm->frame_type][ctx], bc);
-  }
-
+  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
-      break;
-    case PARTITION_HORZ:
-      write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
-      if ((mi_row + bs) < cm->mi_rows)
-        write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
-                      mi_row + bs, mi_col, 1);
-      break;
-    case PARTITION_VERT:
-      write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
-      if ((mi_col + bs) < cm->mi_cols)
-        write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
-                      mi_row, mi_col + bs, 1);
-      break;
-    case PARTITION_SPLIT:
-      for (n = 0; n < 4; n++) {
-        const int j = n >> 1, i = n & 1;
-        write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
-                       tok, tok_end,
-                       mi_row + j * bs, mi_col + i * bs, subsize, n);
-      }
-      break;
-    default:
-      assert(0);
+  if (subsize < BLOCK_8X8) {
+    write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        break;
+      case PARTITION_HORZ:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_row + bs < cm->mi_rows)
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+        break;
+      case PARTITION_VERT:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_col + bs < cm->mi_cols)
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+        break;
+      case PARTITION_SPLIT:
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+                       subsize);
+        break;
+      default:
+        assert(0);
+    }
   }
 
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
-                             mi_row, mi_col, subsize, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                        vp9_writer* const bc,
-                        TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+static void write_modes(VP9_COMP *cpi,
+                        const TileInfo *const tile,
+                        vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   int mi_row, mi_col;
-  MODE_INFO **mi_8x8 = cm->mi_grid_visible;
-  MODE_INFO **m_8x8;
-
-  mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += 8, mi_8x8 += 8 * mis) {
-    m_8x8 = mi_8x8;
-    vp9_zero(cpi->left_seg_context);
+       mi_row += MI_BLOCK_SIZE) {
+    vp9_zero(cpi->mb.e_mbd.left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
-      write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
-                     BLOCK_64X64, 0);
-    }
+         mi_col += MI_BLOCK_SIZE)
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
+                     BLOCK_64X64);
   }
 }
 
-static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
-  vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size];
+static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
+                                    vp9_coeff_stats *coef_branch_ct,
+                                    vp9_coeff_probs_model *coef_probs) {
   vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
-  unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+  unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
       cpi->common.counts.eob_branch[tx_size];
-  vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
-  vp9_prob full_probs[ENTROPY_NODES];
-  int i, j, k, l;
+  int i, j, k, l, m;
 
-  for (i = 0; i < BLOCK_TYPES; ++i) {
+  for (i = 0; i < PLANE_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
-        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          if (l >= 3 && k == 0)
-            continue;
+        for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
           vp9_tree_probs_from_distribution(vp9_coef_tree,
-                                           full_probs,
                                            coef_branch_ct[i][j][k][l],
-                                           coef_counts[i][j][k][l], 0);
-          vpx_memcpy(coef_probs[i][j][k][l], full_probs,
-                     sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+                                           coef_counts[i][j][k][l]);
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
-          coef_probs[i][j][k][l][0] =
-              get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
-                              coef_branch_ct[i][j][k][l][0][1]);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing) {
-            int t;
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters[tx_size][i][j][k][l][t] +=
-                  coef_counts[i][j][k][l][t];
-            context_counters[tx_size][i][j][k][l][MAX_ENTROPY_TOKENS] +=
-                eob_branch_ct[i][j][k][l];
-          }
-#endif
+          for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+            coef_probs[i][j][k][l][m] = get_binary_prob(
+                                            coef_branch_ct[i][j][k][l][m][0],
+                                            coef_branch_ct[i][j][k][l][m][1]);
         }
       }
     }
   }
 }
 
-static void build_coeff_contexts(VP9_COMP *cpi) {
-  TX_SIZE t;
-  for (t = TX_4X4; t <= TX_32X32; t++)
-    build_tree_distribution(cpi, t);
-}
-
 static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
-                                     TX_SIZE tx_size) {
-  vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size];
-  vp9_coeff_probs_model *old_frame_coef_probs =
-      cpi->common.fc.coef_probs[tx_size];
-  vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
+                                     TX_SIZE tx_size,
+                                     vp9_coeff_stats *frame_branch_ct,
+                                     vp9_coeff_probs_model *new_coef_probs) {
+  vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size];
   const vp9_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
   switch (cpi->sf.use_fast_coef_updates) {
-    case 0: {
+    case TWO_LOOP: {
       /* dry run to see if there is any udpate at all needed */
       int savings = 0;
       int update[2] = {0, 0};
-      for (i = 0; i < BLOCK_TYPES; ++i) {
+      for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
-            for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
-                const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+                vp9_prob newp = new_coef_probs[i][j][k][l][t];
+                const vp9_prob oldp = old_coef_probs[i][j][k][l][t];
                 int s;
                 int u = 0;
-
-                if (l >= 3 && k == 0)
-                  continue;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                      old_coef_probs[i][j][k][l], &newp, upd);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -805,23 +560,21 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
         return;
       }
       vp9_write_bit(bc, 1);
-      for (i = 0; i < BLOCK_TYPES; ++i) {
+      for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
-            for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+                vp9_prob newp = new_coef_probs[i][j][k][l][t];
+                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 const vp9_prob upd = DIFF_UPDATE_PROB;
                 int s;
                 int u = 0;
-                if (l >= 3 && k == 0)
-                  continue;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                      old_coef_probs[i][j][k][l], &newp, upd);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -829,10 +582,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                 if (s > 0 && newp != *oldp)
                   u = 1;
                 vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-                if (!cpi->dummy_packing)
-                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -846,28 +595,26 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
       return;
     }
 
-    case 1:
-    case 2: {
+    case ONE_LOOP:
+    case ONE_LOOP_REDUCED: {
       const int prev_coef_contexts_to_update =
-          (cpi->sf.use_fast_coef_updates == 2 ?
-           PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS);
+          cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
+              COEFF_CONTEXTS >> 1 : COEFF_CONTEXTS;
       const int coef_band_to_update =
-          (cpi->sf.use_fast_coef_updates == 2 ?
-           COEF_BANDS >> 1 : COEF_BANDS);
+          cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
+              COEF_BANDS >> 1 : COEF_BANDS;
       int updates = 0;
       int noupdates_before_first = 0;
-      for (i = 0; i < BLOCK_TYPES; ++i) {
+      for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
-            for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+                vp9_prob newp = new_coef_probs[i][j][k][l][t];
+                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 int s;
                 int u = 0;
-                if (l >= 3 && k == 0)
-                  continue;
                 if (l >= prev_coef_contexts_to_update ||
                     k >= coef_band_to_update) {
                   u = 0;
@@ -875,7 +622,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                   if (t == PIVOT_NODE)
                     s = vp9_prob_diff_update_savings_search_model(
                         frame_branch_ct[i][j][k][l][0],
-                        old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                        old_coef_probs[i][j][k][l], &newp, upd);
                   else
                     s = vp9_prob_diff_update_savings_search(
                         frame_branch_ct[i][j][k][l][t],
@@ -886,10 +633,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                 updates += u;
                 if (u == 0 && updates == 0) {
                   noupdates_before_first++;
-#ifdef ENTROPY_STATS
-                  if (!cpi->dummy_packing)
-                    ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                   continue;
                 }
                 if (u == 1 && updates == 1) {
@@ -900,10 +643,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                     vp9_write(bc, 0, upd);
                 }
                 vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-                if (!cpi->dummy_packing)
-                  ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -925,25 +664,22 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
   }
 }
 
-static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
+  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+  TX_SIZE tx_size;
+  vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
+  vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
 
   vp9_clear_system_state();
 
-  // Build the cofficient contexts based on counts collected in encode loop
-  build_coeff_contexts(cpi);
-
-  update_coef_probs_common(bc, cpi, TX_4X4);
+  for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
+    build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
+                            frame_coef_probs[tx_size]);
 
-  // do not do this if not even allowed
-  if (tx_mode > ONLY_4X4)
-    update_coef_probs_common(bc, cpi, TX_8X8);
-
-  if (tx_mode > ALLOW_8X8)
-    update_coef_probs_common(bc, cpi, TX_16X16);
-
-  if (tx_mode > ALLOW_16X16)
-    update_coef_probs_common(bc, cpi, TX_32X32);
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+    update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
+                             frame_coef_probs[tx_size]);
 }
 
 static void encode_loopfilter(struct loopfilter *lf,
@@ -959,38 +695,27 @@ static void encode_loopfilter(struct loopfilter *lf,
   vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled);
 
   if (lf->mode_ref_delta_enabled) {
-    // Do the deltas need to be updated
     vp9_wb_write_bit(wb, lf->mode_ref_delta_update);
     if (lf->mode_ref_delta_update) {
-      // Send update
       for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
         const int delta = lf->ref_deltas[i];
-
-        // Frame level data
-        if (delta != lf->last_ref_deltas[i]) {
+        const int changed = delta != lf->last_ref_deltas[i];
+        vp9_wb_write_bit(wb, changed);
+        if (changed) {
           lf->last_ref_deltas[i] = delta;
-          vp9_wb_write_bit(wb, 1);
-
-          assert(delta != 0);
           vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
           vp9_wb_write_bit(wb, delta < 0);
-        } else {
-          vp9_wb_write_bit(wb, 0);
         }
       }
 
-      // Send update
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
         const int delta = lf->mode_deltas[i];
-        if (delta != lf->last_mode_deltas[i]) {
+        const int changed = delta != lf->last_mode_deltas[i];
+        vp9_wb_write_bit(wb, changed);
+        if (changed) {
           lf->last_mode_deltas[i] = delta;
-          vp9_wb_write_bit(wb, 1);
-
-          assert(delta != 0);
           vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
           vp9_wb_write_bit(wb, delta < 0);
-        } else {
-          vp9_wb_write_bit(wb, 0);
         }
       }
     }
@@ -1067,10 +792,10 @@ static void encode_segmentation(VP9_COMP *cpi,
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {
-            vp9_encode_unsigned_max(wb, abs(data), data_max);
+            encode_unsigned_max(wb, abs(data), data_max);
             vp9_wb_write_bit(wb, data < 0);
           } else {
-            vp9_encode_unsigned_max(wb, data, data_max);
+            encode_unsigned_max(wb, data, data_max);
           }
         }
       }
@@ -1079,9 +804,7 @@ static void encode_segmentation(VP9_COMP *cpi,
 }
 
 
-static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
-  VP9_COMMON *const cm = &cpi->common;
-
+static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
   // Mode
   vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
@@ -1114,26 +837,20 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
         vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
                                   ct_32x32p[j]);
     }
-#ifdef MODE_STATS
-    if (!cpi->dummy_packing)
-      update_tx_count_stats(cm);
-#endif
   }
 }
 
-static void write_interp_filter_type(INTERPOLATION_TYPE type,
-                                     struct vp9_write_bit_buffer *wb) {
-  const int type_to_literal[] = { 1, 0, 2, 3 };
+static void write_interp_filter(INTERP_FILTER filter,
+                                struct vp9_write_bit_buffer *wb) {
+  const int filter_to_literal[] = { 1, 0, 2, 3 };
 
-  vp9_wb_write_bit(wb, type == SWITCHABLE);
-  if (type != SWITCHABLE)
-    vp9_wb_write_literal(wb, type_to_literal[type], 2);
+  vp9_wb_write_bit(wb, filter == SWITCHABLE);
+  if (filter != SWITCHABLE)
+    vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
 }
 
-static void fix_mcomp_filter_type(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  if (cm->mcomp_filter_type == SWITCHABLE) {
+static void fix_interp_filter(VP9_COMMON *cm) {
+  if (cm->interp_filter == SWITCHABLE) {
     // Check to see if only one of the filters is actually used
     int count[SWITCHABLE_FILTERS];
     int i, j, c = 0;
@@ -1147,7 +864,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
       // Only one filter is used. So set the filter at frame level
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         if (count[i]) {
-          cm->mcomp_filter_type = i;
+          cm->interp_filter = i;
           break;
         }
       }
@@ -1188,7 +905,7 @@ static int get_refresh_mask(VP9_COMP *cpi) {
       // other uses are implemented (like RTC/temporal scaling)
       //
       // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
-      // that happens in vp9_onyx_if.c:update_reference_frames() so that it can
+      // that happens in vp9_encoder.c:update_reference_frames() so that it can
       // be done outside of the recode loop.
       return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
              (cpi->refresh_golden_frame << cpi->alt_fb_idx);
@@ -1219,7 +936,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
 
-  vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) *
+  vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
              mi_cols_aligned_to_sb(cm->mi_cols));
 
   tok[0][0] = cpi->tok;
@@ -1237,7 +954,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       TileInfo tile;
 
-      vp9_tile_init(&tile, cm, 0, tile_col);
+      vp9_tile_init(&tile, cm, tile_row, tile_col);
       tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -1250,7 +967,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
-        write_be32(data_ptr + total_size, residual_bc.pos);
+        mem_put_be32(data_ptr + total_size, residual_bc.pos);
         total_size += 4;
       }
 
@@ -1261,9 +978,8 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   return total_size;
 }
 
-static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) {
-  VP9_COMMON *const cm = &cpi->common;
-
+static void write_display_size(const VP9_COMMON *cm,
+                               struct vp9_write_bit_buffer *wb) {
   const int scaling_active = cm->width != cm->display_width ||
                              cm->height != cm->display_height;
   vp9_wb_write_bit(wb, scaling_active);
@@ -1273,30 +989,29 @@ static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) {
   }
 }
 
-static void write_frame_size(VP9_COMP *cpi,
+static void write_frame_size(const VP9_COMMON *cm,
                              struct vp9_write_bit_buffer *wb) {
-  VP9_COMMON *const cm = &cpi->common;
   vp9_wb_write_literal(wb, cm->width - 1, 16);
   vp9_wb_write_literal(wb, cm->height - 1, 16);
 
-  write_display_size(cpi, wb);
+  write_display_size(cm, wb);
 }
 
 static void write_frame_size_with_refs(VP9_COMP *cpi,
                                        struct vp9_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
-  int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
-                                      cpi->alt_fb_idx};
-  int i, found = 0;
+  int found = 0;
 
-  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
-    YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
     found = cm->width == cfg->y_crop_width &&
             cm->height == cfg->y_crop_height;
 
-    // TODO(ivan): This prevents a bug while more than 3 buffers are used. Do it
-    // in a better way.
-    if (cpi->use_svc) {
+    // Set "found" to 0 for temporal svc and for spatial svc key frame
+    if (cpi->use_svc &&
+        (cpi->svc.number_spatial_layers == 1 ||
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) {
       found = 0;
     }
     vp9_wb_write_bit(wb, found);
@@ -1310,7 +1025,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
     vp9_wb_write_literal(wb, cm->height - 1, 16);
   }
 
-  write_display_size(cpi, wb);
+  write_display_size(cm, wb);
 }
 
 static void write_sync_code(struct vp9_write_bit_buffer *wb) {
@@ -1319,19 +1034,22 @@ static void write_sync_code(struct vp9_write_bit_buffer *wb) {
   vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
 }
 
+static void write_profile(BITSTREAM_PROFILE profile,
+                          struct vp9_write_bit_buffer *wb) {
+  assert(profile < MAX_PROFILES);
+  vp9_wb_write_bit(wb, profile & 1);
+  vp9_wb_write_bit(wb, profile >> 1);
+}
+
 static void write_uncompressed_header(VP9_COMP *cpi,
                                       struct vp9_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
 
   vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
 
-  // bitstream version.
-  // 00 - profile 0. 4:2:0 only
-  // 10 - profile 1. adds 4:4:4, 4:2:2, alpha
-  vp9_wb_write_bit(wb, cm->version);
-  vp9_wb_write_bit(wb, 0);
+  write_profile(cm->profile, wb);
 
-  vp9_wb_write_bit(wb, 0);
+  vp9_wb_write_bit(wb, 0);  // show_existing_frame
   vp9_wb_write_bit(wb, cm->frame_type);
   vp9_wb_write_bit(wb, cm->show_frame);
   vp9_wb_write_bit(wb, cm->error_resilient_mode);
@@ -1339,23 +1057,25 @@ static void write_uncompressed_header(VP9_COMP *cpi,
   if (cm->frame_type == KEY_FRAME) {
     const COLOR_SPACE cs = UNKNOWN;
     write_sync_code(wb);
+    if (cm->profile > PROFILE_1) {
+      assert(cm->bit_depth > BITS_8);
+      vp9_wb_write_bit(wb, cm->bit_depth - BITS_10);
+    }
     vp9_wb_write_literal(wb, cs, 3);
     if (cs != SRGB) {
       vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
-      if (cm->version == 1) {
+      if (cm->profile >= PROFILE_1) {
         vp9_wb_write_bit(wb, cm->subsampling_x);
         vp9_wb_write_bit(wb, cm->subsampling_y);
         vp9_wb_write_bit(wb, 0);  // has extra plane
       }
     } else {
-      assert(cm->version == 1);
+      assert(cm->profile == PROFILE_1);
       vp9_wb_write_bit(wb, 0);  // has extra plane
     }
 
-    write_frame_size(cpi, wb);
+    write_frame_size(cm, wb);
   } else {
-    const int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
-                                              cpi->alt_fb_idx};
     if (!cm->show_frame)
       vp9_wb_write_bit(wb, cm->intra_only);
 
@@ -1365,22 +1085,23 @@ static void write_uncompressed_header(VP9_COMP *cpi,
     if (cm->intra_only) {
       write_sync_code(wb);
 
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
-      write_frame_size(cpi, wb);
+      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      write_frame_size(cm, wb);
     } else {
-      int i;
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
-      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
-        vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LOG2);
-        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]);
+      MV_REFERENCE_FRAME ref_frame;
+      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        vp9_wb_write_literal(wb, get_ref_frame_idx(cpi, ref_frame),
+                             REF_FRAMES_LOG2);
+        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
       }
 
       write_frame_size_with_refs(cpi, wb);
 
       vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
 
-      fix_mcomp_filter_type(cpi);
-      write_interp_filter_type(cm->mcomp_filter_type, wb);
+      fix_interp_filter(cm);
+      write_interp_filter(cm->interp_filter, wb);
     }
   }
 
@@ -1389,7 +1110,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
     vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
 
-  vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2);
+  vp9_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
   encode_loopfilter(&cm->lf, wb);
   encode_quantization(cm, wb);
@@ -1409,36 +1130,30 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
   else
-    encode_txfm_probs(cpi, &header_bc);
+    encode_txfm_probs(cm, &header_bc);
 
   update_coef_probs(cpi, &header_bc);
-
-#ifdef ENTROPY_STATS
-  active_section = 2;
-#endif
-
-  vp9_update_skip_probs(cpi, &header_bc);
+  update_skip_probs(cm, &header_bc);
 
   if (!frame_is_intra_only(cm)) {
     int i;
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
 
-    update_inter_mode_probs(cm, &header_bc);
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
+                       cm->counts.inter_mode[i], INTER_MODES, &header_bc);
+
     vp9_zero(cm->counts.inter_mode);
 
-    if (cm->mcomp_filter_type == SWITCHABLE)
-      update_switchable_interp_probs(cpi, &header_bc);
+    if (cm->interp_filter == SWITCHABLE)
+      update_switchable_interp_probs(cm, &header_bc);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
-                                cpi->intra_inter_count[i]);
+                                cm->counts.intra_inter[i]);
 
     if (cm->allow_comp_inter_inter) {
-      const int comp_pred_mode = cpi->common.comp_pred_mode;
-      const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY;
-      const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION;
+      const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+      const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
 
       vp9_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
@@ -1446,36 +1161,33 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
         if (use_hybrid_pred)
           for (i = 0; i < COMP_INTER_CONTEXTS; i++)
             vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                      cpi->comp_inter_count[i]);
+                                      cm->counts.comp_inter[i]);
       }
     }
 
-    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                  cpi->single_ref_count[i][0]);
+                                  cm->counts.single_ref[i][0]);
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                  cpi->single_ref_count[i][1]);
+                                  cm->counts.single_ref[i][1]);
       }
     }
 
-    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+    if (cm->reference_mode != SINGLE_REFERENCE)
       for (i = 0; i < REF_CONTEXTS; i++)
         vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                  cpi->comp_ref_count[i]);
+                                  cm->counts.comp_ref[i]);
 
-    update_mbintra_mode_probs(cpi, &header_bc);
+    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+      prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i],
+                       cm->counts.y_mode[i], INTRA_MODES, &header_bc);
 
-    for (i = 0; i < PARTITION_CONTEXTS; ++i) {
-      vp9_prob pnew[PARTITION_TYPES - 1];
-      unsigned int bct[PARTITION_TYPES - 1][2];
-      update_mode(&header_bc, PARTITION_TYPES,
-                  vp9_partition_tree, pnew,
-                  fc->partition_prob[cm->frame_type][i], bct,
-                  (unsigned int *)cpi->partition_count[i]);
-    }
+    for (i = 0; i < PARTITION_CONTEXTS; ++i)
+      prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
+                       cm->counts.partition[i], PARTITION_TYPES, &header_bc);
 
-    vp9_write_nmv_probs(cpi, cm->allow_high_precision_mv, &header_bc);
+    vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
   }
 
   vp9_stop_encode(&header_bc);
@@ -1484,9 +1196,9 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   return header_bc.pos;
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   uint8_t *data = dest;
-  size_t first_part_size;
+  size_t first_part_size, uncompressed_hdr_size;
   struct vp9_write_bit_buffer wb = {data, 0};
   struct vp9_write_bit_buffer saved_wb;
 
@@ -1494,75 +1206,20 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
   saved_wb = wb;
   vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
 
-  data += vp9_rb_bytes_written(&wb);
+  uncompressed_hdr_size = vp9_rb_bytes_written(&wb);
+  data += uncompressed_hdr_size;
 
   vp9_compute_update_table();
 
-#ifdef ENTROPY_STATS
-  if (cm->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-#endif
-
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
-  vp9_wb_write_literal(&saved_wb, first_part_size, 16);
+  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
+  vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
   data += encode_tiles(cpi, data);
 
   *size = data - dest;
 }
 
-#ifdef ENTROPY_STATS
-static void print_tree_update_for_type(FILE *f,
-                                       vp9_coeff_stats *tree_update_hist,
-                                       int block_types, const char *header) {
-  int i, j, k, l, m;
-
-  fprintf(f, "const vp9_coeff_prob %s = {\n", header);
-  for (i = 0; i < block_types; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < REF_TYPES; j++) {
-      fprintf(f, "  { \n");
-      for (k = 0; k < COEF_BANDS; k++) {
-        fprintf(f, "    {\n");
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          fprintf(f, "      {");
-          for (m = 0; m < ENTROPY_NODES; m++) {
-            fprintf(f, "%3d, ",
-                    get_binary_prob(tree_update_hist[i][j][k][l][m][0],
-                                    tree_update_hist[i][j][k][l][m][1]));
-          }
-          fprintf(f, "},\n");
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-}
-
-void print_tree_update_probs() {
-  FILE *f = fopen("coefupdprob.h", "w");
-  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
-
-  print_tree_update_for_type(f, tree_update_hist[TX_4X4],   BLOCK_TYPES,
-                             "vp9_coef_update_probs_4x4[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist[TX_8X8],   BLOCK_TYPES,
-                             "vp9_coef_update_probs_8x8[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist[TX_16X16], BLOCK_TYPES,
-                             "vp9_coef_update_probs_16x16[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist[TX_32X32], BLOCK_TYPES,
-                             "vp9_coef_update_probs_32x32[BLOCK_TYPES]");
-
-  fclose(f);
-  f = fopen("treeupdate.bin", "wb");
-  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
-  fclose(f);
-}
-#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
index b3dbee1a772..ddfd0ed4ff2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
@@ -12,6 +12,18 @@
 #ifndef VP9_ENCODER_VP9_BITSTREAM_H_
 #define VP9_ENCODER_VP9_BITSTREAM_H_
 
-void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+void vp9_entropy_mode_init(void);  // (void): a real prototype in C
+
+void vp9_pack_bitstream(struct VP9_COMP *cpi, uint8_t *dest, size_t *size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_BITSTREAM_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
index 583c6c8d02e..2ccf4f80e87 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
@@ -11,30 +11,37 @@
 #ifndef VP9_ENCODER_VP9_BLOCK_H_
 #define VP9_ENCODER_VP9_BLOCK_H_
 
-#include "vp9/common/vp9_onyx.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-// motion search site
-typedef struct {
-  MV mv;
-  int offset;
-} search_site;
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
   uint8_t *zcoeff_blk;
+  int16_t *coeff[MAX_MB_PLANE][3];
+  int16_t *qcoeff[MAX_MB_PLANE][3];
+  int16_t *dqcoeff[MAX_MB_PLANE][3];
+  uint16_t *eobs[MAX_MB_PLANE][3];
+
+  // dual buffer pointers, 0: in use, 1: best in store
+  int16_t *coeff_pbuf[MAX_MB_PLANE][3];
+  int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+  int is_coded;
   int num_4x4_blk;
   int skip;
-  int_mv best_ref_mv;
-  int_mv second_best_ref_mv;
+  int_mv best_ref_mv[2];
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int rate;
   int distortion;
-  int64_t intra_error;
   int best_mode_index;
   int rddiv;
   int rdmult;
@@ -47,17 +54,14 @@ typedef struct {
   // motion vector cache for adaptive motion search control in partition
   // search loop
   int_mv pred_mv[MAX_REF_FRAMES];
-
-  // Bit flag for each mode whether it has high error in comparison to others.
-  unsigned int modes_with_high_error;
-
-  // Bit flag for each ref frame whether it has high error compared to others.
-  unsigned int frames_with_high_error;
+  INTERP_FILTER pred_interp_filter;
 } PICK_MODE_CONTEXT;
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
-  DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
+  int16_t *qcoeff;
+  int16_t *coeff;
+  uint16_t *eobs;
   struct buf_2d src;
 
   // Quantizer setings
@@ -69,11 +73,23 @@ struct macroblock_plane {
   // Zbin Over Quant value
   int16_t zbin_extra;
 };
+typedef struct PC_TREE {  // one square node of the partition context tree
+  int index;  // NOTE(review): never set in vp9_setup_pc_tree
+  PARTITION_TYPE partitioning;
+  BLOCK_SIZE block_size;  // square size of this node (BLOCK_8X8..BLOCK_64X64)
+  PICK_MODE_CONTEXT none;  // context sized for the whole block
+  PICK_MODE_CONTEXT horizontal[2];  // each sized for half of the block
+  PICK_MODE_CONTEXT vertical[2];  // each sized for half of the block
+  union {
+    struct PC_TREE *split[4];  // child nodes, for nodes larger than 8x8
+    PICK_MODE_CONTEXT *leaf_split[4];  // shared leaf context, for 8x8 nodes
+  };
+} PC_TREE;
 
 /* The [2] dimension is for whether we skip the EOB node (i.e. if previous
  * coefficient in this block was zero) or not. */
-typedef unsigned int vp9_coeff_cost[BLOCK_TYPES][REF_TYPES][COEF_BANDS][2]
-                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+                                   [COEFF_CONTEXTS][ENTROPY_TOKENS];
 
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
@@ -81,10 +97,10 @@ struct macroblock {
 
   MACROBLOCKD e_mbd;
   int skip_block;
-
-  search_site *ss;
-  int ss_count;
-  int searches_per_step;
+  int select_txfm_size;
+  int skip_recode;
+  int skip_optimize;
+  int q_index;
 
   int errorperbit;
   int sadperbit16;
@@ -92,13 +108,12 @@ struct macroblock {
   int rddiv;
   int rdmult;
   unsigned int mb_energy;
-  unsigned int *mb_activity_ptr;
-  int *mb_norm_activity_ptr;
-  signed int act_zbin_adj;
 
   int mv_best_ref_index[MAX_REF_FRAMES];
   unsigned int max_mv_context[MAX_REF_FRAMES];
   unsigned int source_variance;
+  unsigned int pred_sse[MAX_REF_FRAMES];
+  int pred_mv_sad[MAX_REF_FRAMES];
 
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
@@ -114,12 +129,6 @@ struct macroblock {
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
-  int mbmode_cost[MB_MODE_COUNT];
-  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
-  int intra_uv_mode_cost[2][MB_MODE_COUNT];
-  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   int mv_col_min;
@@ -132,11 +141,12 @@ struct macroblock {
 
   int encode_breakout;
 
-  unsigned char *active_ptr;
+  int in_active_map;
 
   // note that token_costs is the cost when eob node is skipped
   vp9_coeff_cost token_costs[TX_SIZES];
-  uint8_t token_cache[1024];
+
+  int in_static_area;
 
   int optimize;
 
@@ -145,92 +155,19 @@ struct macroblock {
   int skip_encode;
 
   // Used to store sub partition's choices.
-  int fast_ms;
   int_mv pred_mv[MAX_REF_FRAMES];
-  int subblock_ref;
-
-  // TODO(jingning): Need to refactor the structure arrays that buffers the
-  // coding mode decisions of each partition type.
-  PICK_MODE_CONTEXT ab4x4_context[4][4][4];
-  PICK_MODE_CONTEXT sb8x4_context[4][4][4];
-  PICK_MODE_CONTEXT sb4x8_context[4][4][4];
-  PICK_MODE_CONTEXT sb8x8_context[4][4][4];
-  PICK_MODE_CONTEXT sb8x16_context[4][4][2];
-  PICK_MODE_CONTEXT sb16x8_context[4][4][2];
-  PICK_MODE_CONTEXT mb_context[4][4];
-  PICK_MODE_CONTEXT sb32x16_context[4][2];
-  PICK_MODE_CONTEXT sb16x32_context[4][2];
-  // when 4 MBs share coding parameters:
-  PICK_MODE_CONTEXT sb32_context[4];
-  PICK_MODE_CONTEXT sb32x64_context[2];
-  PICK_MODE_CONTEXT sb64x32_context[2];
-  PICK_MODE_CONTEXT sb64_context;
-  int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
 
-  BLOCK_SIZE b_partitioning[4][4][4];
-  BLOCK_SIZE mb_partitioning[4][4];
-  BLOCK_SIZE sb_partitioning[4];
-  BLOCK_SIZE sb64_partitioning;
+  PICK_MODE_CONTEXT *leaf_tree;
+  PC_TREE *pc_tree;
+  PC_TREE *pc_root;
+  int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
 
   void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
 };
 
-// TODO(jingning): the variables used here are little complicated. need further
-// refactoring on organizing the temporary buffers, when recursive
-// partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  switch (bsize) {
-    case BLOCK_64X64:
-      return &x->sb64_context;
-    case BLOCK_64X32:
-      return &x->sb64x32_context[xd->sb_index];
-    case BLOCK_32X64:
-      return &x->sb32x64_context[xd->sb_index];
-    case BLOCK_32X32:
-      return &x->sb32_context[xd->sb_index];
-    case BLOCK_32X16:
-      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X32:
-      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X16:
-      return &x->mb_context[xd->sb_index][xd->mb_index];
-    case BLOCK_16X8:
-      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X16:
-      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X8:
-      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_8X4:
-      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_4X8:
-      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
-    case BLOCK_4X4:
-      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
-    default:
-      assert(0);
-      return NULL;
-  }
-}
-
-struct rdcost_block_args {
-  MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[16];
-  ENTROPY_CONTEXT t_left[16];
-  TX_SIZE tx_size;
-  int bw;
-  int bh;
-  int rate;
-  int64_t dist;
-  int64_t sse;
-  int this_rate;
-  int64_t this_dist;
-  int64_t this_sse;
-  int64_t this_rd;
-  int64_t best_rd;
-  int skip;
-  const int16_t *scan, *nb;
-};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
new file mode 100644
index 00000000000..ac9b562248d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
@@ -0,0 +1,156 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_context_tree.h"
+
+static const BLOCK_SIZE square[] = {  // indexed by tree level, leaves first
+  BLOCK_8X8,
+  BLOCK_16X16,
+  BLOCK_32X32,
+  BLOCK_64X64,
+};
+
+static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
+                               PICK_MODE_CONTEXT *ctx) {
+  // Allocates ctx's buffers, sized for at least four 4x4 blocks.
+  const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
+  const int num_pix = num_blk << 4;  // 16 values per 4x4 block
+  int i, k;
+  ctx->num_4x4_blk = num_blk;
+  // Size zcoeff_blk by the clamped count so it matches ctx->num_4x4_blk.
+  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t)));
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (k = 0; k < 3; ++k) {
+      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+                      vpx_memalign(16, num_pix * sizeof(uint16_t)));
+      ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];  // pbuf starts as an alias
+      ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
+      ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
+      ctx->eobs_pbuf[i][k]    = ctx->eobs[i][k];
+    }
+  }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {  // frees ctx buffers
+  int plane, slot;  // indices into the [MAX_MB_PLANE][3] buffer arrays
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    for (slot = 0; slot < 3; ++slot) {
+      vpx_free(ctx->coeff[plane][slot]);
+      vpx_free(ctx->qcoeff[plane][slot]);
+      vpx_free(ctx->dqcoeff[plane][slot]);
+      vpx_free(ctx->eobs[plane][slot]);
+      ctx->coeff[plane][slot] = NULL;  // leave no dangling pointers behind
+      ctx->qcoeff[plane][slot] = NULL;
+      ctx->dqcoeff[plane][slot] = NULL;
+      ctx->eobs[plane][slot] = NULL;
+    }
+  }
+  vpx_free(ctx->zcoeff_blk);
+  ctx->zcoeff_blk = NULL;
+}
+
+static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree,
+                                int num_4x4_blk) {
+  const int half_blk = num_4x4_blk / 2;  // size passed for each half context
+  alloc_mode_context(cm, num_4x4_blk, &tree->none);
+  alloc_mode_context(cm, half_blk, &tree->horizontal[0]);
+  alloc_mode_context(cm, half_blk, &tree->vertical[0]);
+  /* TODO(Jbb): for 4x8 and 8x4 these allocated values are not used.
+   * Figure out a better way to do this. */
+  alloc_mode_context(cm, half_blk, &tree->horizontal[1]);
+  alloc_mode_context(cm, half_blk, &tree->vertical[1]);
+}
+
+static void free_tree_contexts(PC_TREE *tree) {  // undoes alloc_tree_contexts
+  free_mode_context(&tree->none);
+  free_mode_context(&tree->horizontal[0]);
+  free_mode_context(&tree->vertical[0]);
+  free_mode_context(&tree->horizontal[1]);
+  free_mode_context(&tree->vertical[1]);
+}
+
+// Sets up a tree of contexts with one node per square partition at each
+// level. Every node has contexts for none, horizontal, vertical, and split,
+// along with a block_size value and a selected block_size which represents
+// the state of our search. x->pc_root ends up as the last node of x->pc_tree.
+void vp9_setup_pc_tree(VP9_COMMON *cm, MACROBLOCK *x) {
+  int i, j;
+  const int leaf_nodes = 64;
+  const int tree_nodes = 64 + 16 + 4 + 1;
+  int pc_tree_index = 0;
+  PC_TREE *this_pc;
+  PICK_MODE_CONTEXT *this_leaf;
+  int square_index = 1;
+  int nodes;
+
+  vpx_free(x->leaf_tree);  // NOTE(review): array only; old buffers not freed
+  CHECK_MEM_ERROR(cm, x->leaf_tree, vpx_calloc(leaf_nodes,
+                                               sizeof(*x->leaf_tree)));
+  vpx_free(x->pc_tree);
+  CHECK_MEM_ERROR(cm, x->pc_tree, vpx_calloc(tree_nodes, sizeof(*x->pc_tree)));
+
+  this_pc = &x->pc_tree[0];
+  this_leaf = &x->leaf_tree[0];
+
+  // Blocks smaller than 8x8 that lie in the same 8x8 block share one context,
+  // so a single leaf context is allocated per 8x8 block.
+  for (i = 0; i < leaf_nodes; ++i)
+    alloc_mode_context(cm, 1, &x->leaf_tree[i]);
+
+  // Set up the 64 8x8 nodes; all four leaf_split entries share one leaf.
+  for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+    PC_TREE *const tree = &x->pc_tree[pc_tree_index];
+    tree->block_size = square[0];
+    alloc_tree_contexts(cm, tree, 4);
+    tree->leaf_split[0] = this_leaf++;
+    for (j = 1; j < 4; j++)
+      tree->leaf_split[j] = tree->leaf_split[0];
+  }
+
+  // Each larger node has four children. Fill the 16x16, 32x32 and 64x64
+  // levels in turn, handing out the nodes set up so far as children.
+  for (nodes = 16; nodes > 0; nodes >>= 2) {
+    for (i = 0; i < nodes; ++i) {
+      PC_TREE *const tree = &x->pc_tree[pc_tree_index];
+      alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
+      tree->block_size = square[square_index];
+      for (j = 0; j < 4; j++)
+        tree->split[j] = this_pc++;
+      ++pc_tree_index;
+    }
+    ++square_index;
+  }
+  x->pc_root = &x->pc_tree[tree_nodes - 1];
+  x->pc_root[0].none.best_mode_index = 2;  // NOTE(review): why 2? confirm
+}
+
+void vp9_free_pc_tree(MACROBLOCK *x) {
+  const int leaf_nodes = 64;
+  const int tree_nodes = leaf_nodes + 16 + 4 + 1;
+  int n;
+  // Release the buffers of every shared leaf context.
+  for (n = 0; n < leaf_nodes; ++n)
+    free_mode_context(&x->leaf_tree[n]);
+
+  // Release the none/horizontal/vertical contexts of every tree node.
+  for (n = 0; n < tree_nodes; ++n)
+    free_tree_contexts(&x->pc_tree[n]);
+
+  vpx_free(x->leaf_tree);
+  x->leaf_tree = NULL;
+  vpx_free(x->pc_tree);
+  x->pc_tree = NULL;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_modecosts.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
index f43033e5fc8..66a6f00e3ef 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_modecosts.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,10 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#define VP9_ENCODER_VP9_CONTEXT_TREE_H_
 
-#ifndef VP9_ENCODER_VP9_MODECOSTS_H_
-#define VP9_ENCODER_VP9_MODECOSTS_H_
+#include "vp9/encoder/vp9_encoder.h"
 
-void vp9_init_mode_costs(VP9_COMP *x);
+void vp9_setup_pc_tree(VP9_COMMON *cm, MACROBLOCK *x);
+void vp9_free_pc_tree(MACROBLOCK *x);
 
-#endif  // VP9_ENCODER_VP9_MODECOSTS_H_
+#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_boolhuff.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_cost.c
index 32c136e0f70..1c3c3d24847 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_boolhuff.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_cost.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,18 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
-#include "vp9/encoder/vp9_boolhuff.h"
-#include "vp9/common/vp9_entropy.h"
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-
-#endif
-
-#ifdef ENTROPY_STATS
-unsigned int active_section = 0;
-#endif
+#include "vp9/encoder/vp9_cost.h"
 
 const unsigned int vp9_prob_cost[256] = {
   2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
@@ -45,24 +34,29 @@ const unsigned int vp9_prob_cost[256] = {
   22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
   4,    3,    1,    1};
 
-void vp9_start_encode(vp9_writer *br, uint8_t *source) {
-  br->lowvalue = 0;
-  br->range    = 255;
-  br->value    = 0;
-  br->count    = -24;
-  br->buffer   = source;
-  br->pos      = 0;
-  vp9_write_bit(br, 0);
-}
+static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+                 int i, int c) {
+  const vp9_prob prob = probs[i / 2];
+  int b;
 
-void vp9_stop_encode(vp9_writer *br) {
-  int i;
+  for (b = 0; b <= 1; ++b) {
+    const int cc = c + vp9_cost_bit(prob, b);
+    const vp9_tree_index ii = tree[i + b];
 
-  for (i = 0; i < 32; i++)
-    vp9_write_bit(br, 0);
+    if (ii <= 0)
+      costs[-ii] = cc;
+    else
+      cost(costs, tree, probs, ii, cc);
+  }
+}
 
-  // Ensure there's no ambigous collision with any index marker bytes
-  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
-    br->buffer[br->pos++] = 0;
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  cost(costs, tree, probs, 0, 0);
 }
 
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  // The root must have a leaf on branch 0 and a subtree on branch 1.
+  assert(tree[0] <= 0 && tree[1] > 0);
+  costs[-tree[0]] = vp9_cost_zero(probs[0]);
+  cost(costs, tree, probs, 2, 0);  // subtree at index 2, starting from cost 0
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_cost.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_cost.h
new file mode 100644
index 00000000000..6d2b9400d7e
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_cost.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_COST_H_
+#define VP9_ENCODER_VP9_COST_H_
+
+#include "vp9/common/vp9_prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const unsigned int vp9_prob_cost[256];
+
+#define vp9_cost_zero(prob) (vp9_prob_cost[prob])
+
+#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob))
+
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vp9_complement(prob) \
+                                                    : (prob))
+
+static INLINE unsigned int cost_branch256(const unsigned int ct[2],
+                                          vp9_prob p) {
+  return vp9_cost_zero(p) * ct[0] + vp9_cost_one(p) * ct[1];  // zeros + ones
+}
+
+static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+                             int bits, int len) {
+  vp9_tree_index node = 0;
+  int total = 0;
+  // Walk from the root, consuming `bits` from bit (len - 1) down to bit 0.
+  do {
+    const int branch = (bits >> --len) & 1;
+    total += vp9_cost_bit(probs[node >> 1], branch);
+    node = tree[node + branch];
+  } while (len);
+
+  return total;
+}
+
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_COST_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
index 065992a257a..d5232393f3c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
@@ -18,7 +18,11 @@
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#include "vp9/encoder/vp9_dct.h"
+static INLINE int fdct_round_shift(int input) {
+  const int rounded = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  assert(rounded >= INT16_MIN && rounded <= INT16_MAX);  // must fit int16_t
+  return rounded;
+}
 
 static void fdct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
@@ -31,19 +35,19 @@ static void fdct4(const int16_t *input, int16_t *output) {
 
   temp1 = (step[0] + step[1]) * cospi_16_64;
   temp2 = (step[0] - step[1]) * cospi_16_64;
-  output[0] = dct_const_round_shift(temp1);
-  output[2] = dct_const_round_shift(temp2);
+  output[0] = fdct_round_shift(temp1);
+  output[2] = fdct_round_shift(temp2);
   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-  output[1] = dct_const_round_shift(temp1);
-  output[3] = dct_const_round_shift(temp2);
+  output[1] = fdct_round_shift(temp1);
+  output[3] = fdct_round_shift(temp2);
 }
 
 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -80,12 +84,12 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
       step[3] = input[0] - input[3];
       temp1 = (step[0] + step[1]) * cospi_16_64;
       temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = dct_const_round_shift(temp1);
-      out[2] = dct_const_round_shift(temp2);
+      out[0] = fdct_round_shift(temp1);
+      out[2] = fdct_round_shift(temp2);
       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = dct_const_round_shift(temp1);
-      out[3] = dct_const_round_shift(temp2);
+      out[1] = fdct_round_shift(temp1);
+      out[3] = fdct_round_shift(temp2);
       // Do next column (which is a transposed row in second/horizontal pass)
       in++;
       out += 4;
@@ -138,10 +142,10 @@ static void fadst4(const int16_t *input, int16_t *output) {
   s3 = x2 - x0 + x3;
 
   // 1-D transform scaling factor is sqrt(2).
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
+  output[0] = fdct_round_shift(s0);
+  output[1] = fdct_round_shift(s1);
+  output[2] = fdct_round_shift(s2);
+  output[3] = fdct_round_shift(s3);
 }
 
 static const transform_2d FHT_4[] = {
@@ -151,32 +155,36 @@ static const transform_2d FHT_4[] = {
   { fadst4, fadst4 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
-                        int stride, int tx_type) {
-  int16_t out[4 * 4];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[4], temp_out[4];
-  const transform_2d ht = FHT_4[tx_type];
+void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct4x4_c(input, output, stride);
+  } else {
+    int16_t out[4 * 4];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[4], temp_out[4];
+    const transform_2d ht = FHT_4[tx_type];
 
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j * stride + i] * 16;
-    if (i == 0 && temp_in[0])
-      temp_in[0] += 1;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      outptr[j * 4 + i] = temp_out[j];
-  }
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = input[j * stride + i] * 16;
+      if (i == 0 && temp_in[0])
+        temp_in[0] += 1;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        outptr[j * 4 + i] = temp_out[j];
+    }
 
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j + i * 4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    // Rows
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j + i * 4];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    }
   }
 }
 
@@ -204,16 +212,16 @@ static void fdct8(const int16_t *input, int16_t *output) {
   t1 = (x0 - x1) * cospi_16_64;
   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-  output[0] = dct_const_round_shift(t0);
-  output[2] = dct_const_round_shift(t2);
-  output[4] = dct_const_round_shift(t1);
-  output[6] = dct_const_round_shift(t3);
+  output[0] = fdct_round_shift(t0);
+  output[2] = fdct_round_shift(t2);
+  output[4] = fdct_round_shift(t1);
+  output[6] = fdct_round_shift(t3);
 
   // Stage 2
   t0 = (s6 - s5) * cospi_16_64;
   t1 = (s6 + s5) * cospi_16_64;
-  t2 = dct_const_round_shift(t0);
-  t3 = dct_const_round_shift(t1);
+  t2 = fdct_round_shift(t0);
+  t3 = fdct_round_shift(t1);
 
   // Stage 3
   x0 = s4 + t2;
@@ -226,10 +234,10 @@ static void fdct8(const int16_t *input, int16_t *output) {
   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-  output[1] = dct_const_round_shift(t0);
-  output[3] = dct_const_round_shift(t2);
-  output[5] = dct_const_round_shift(t1);
-  output[7] = dct_const_round_shift(t3);
+  output[1] = fdct_round_shift(t0);
+  output[3] = fdct_round_shift(t2);
+  output[5] = fdct_round_shift(t1);
+  output[7] = fdct_round_shift(t3);
 }
 
 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
@@ -264,16 +272,16 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
       t1 = (x0 - x1) * cospi_16_64;
       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-      output[0 * 8] = dct_const_round_shift(t0);
-      output[2 * 8] = dct_const_round_shift(t2);
-      output[4 * 8] = dct_const_round_shift(t1);
-      output[6 * 8] = dct_const_round_shift(t3);
+      output[0 * 8] = fdct_round_shift(t0);
+      output[2 * 8] = fdct_round_shift(t2);
+      output[4 * 8] = fdct_round_shift(t1);
+      output[6 * 8] = fdct_round_shift(t3);
 
       // Stage 2
       t0 = (s6 - s5) * cospi_16_64;
       t1 = (s6 + s5) * cospi_16_64;
-      t2 = dct_const_round_shift(t0);
-      t3 = dct_const_round_shift(t1);
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
 
       // Stage 3
       x0 = s4 + t2;
@@ -286,10 +294,10 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-      output[1 * 8] = dct_const_round_shift(t0);
-      output[3 * 8] = dct_const_round_shift(t2);
-      output[5 * 8] = dct_const_round_shift(t1);
-      output[7 * 8] = dct_const_round_shift(t3);
+      output[1 * 8] = fdct_round_shift(t0);
+      output[3 * 8] = fdct_round_shift(t2);
+      output[5 * 8] = fdct_round_shift(t1);
+      output[7 * 8] = fdct_round_shift(t3);
       input++;
       output++;
     }
@@ -307,7 +315,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -388,16 +396,16 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         t1 = (x0 - x1) * cospi_16_64;
         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = dct_const_round_shift(t0);
-        out[4] = dct_const_round_shift(t2);
-        out[8] = dct_const_round_shift(t1);
-        out[12] = dct_const_round_shift(t3);
+        out[0] = fdct_round_shift(t0);
+        out[4] = fdct_round_shift(t2);
+        out[8] = fdct_round_shift(t1);
+        out[12] = fdct_round_shift(t3);
 
         // Stage 2
         t0 = (s6 - s5) * cospi_16_64;
         t1 = (s6 + s5) * cospi_16_64;
-        t2 = dct_const_round_shift(t0);
-        t3 = dct_const_round_shift(t1);
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
 
         // Stage 3
         x0 = s4 + t2;
@@ -410,22 +418,22 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-        out[2] = dct_const_round_shift(t0);
-        out[6] = dct_const_round_shift(t2);
-        out[10] = dct_const_round_shift(t1);
-        out[14] = dct_const_round_shift(t3);
+        out[2] = fdct_round_shift(t0);
+        out[6] = fdct_round_shift(t2);
+        out[10] = fdct_round_shift(t1);
+        out[14] = fdct_round_shift(t3);
       }
       // Work on the next eight values; step1 -> odd_results
       {
         // step 2
         temp1 = (step1[5] - step1[2]) * cospi_16_64;
         temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = dct_const_round_shift(temp1);
-        step2[3] = dct_const_round_shift(temp2);
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
         temp1 = (step1[4] + step1[3]) * cospi_16_64;
         temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = dct_const_round_shift(temp1);
-        step2[5] = dct_const_round_shift(temp2);
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
         // step 3
         step3[0] = step1[0] + step2[3];
         step3[1] = step1[1] + step2[2];
@@ -438,12 +446,12 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         // step 4
         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
         temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
-        step2[1] = dct_const_round_shift(temp1);
-        step2[2] = dct_const_round_shift(temp2);
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
         temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-        step2[5] = dct_const_round_shift(temp1);
-        step2[6] = dct_const_round_shift(temp2);
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
         // step 5
         step1[0] = step3[0] + step2[1];
         step1[1] = step3[0] - step2[1];
@@ -456,20 +464,20 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         // step 6
         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = dct_const_round_shift(temp1);
-        out[9] = dct_const_round_shift(temp2);
+        out[1] = fdct_round_shift(temp1);
+        out[9] = fdct_round_shift(temp2);
         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = dct_const_round_shift(temp1);
-        out[13] = dct_const_round_shift(temp2);
+        out[5] = fdct_round_shift(temp1);
+        out[13] = fdct_round_shift(temp2);
         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = dct_const_round_shift(temp1);
-        out[11] = dct_const_round_shift(temp2);
+        out[3] = fdct_round_shift(temp1);
+        out[11] = fdct_round_shift(temp2);
         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = dct_const_round_shift(temp1);
-        out[15] = dct_const_round_shift(temp2);
+        out[7] = fdct_round_shift(temp1);
+        out[15] = fdct_round_shift(temp2);
       }
       // Do next column (which is a transposed row in second/horizontal pass)
       in++;
@@ -503,14 +511,14 @@ static void fadst8(const int16_t *input, int16_t *output) {
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = dct_const_round_shift(s0 + s4);
-  x1 = dct_const_round_shift(s1 + s5);
-  x2 = dct_const_round_shift(s2 + s6);
-  x3 = dct_const_round_shift(s3 + s7);
-  x4 = dct_const_round_shift(s0 - s4);
-  x5 = dct_const_round_shift(s1 - s5);
-  x6 = dct_const_round_shift(s2 - s6);
-  x7 = dct_const_round_shift(s3 - s7);
+  x0 = fdct_round_shift(s0 + s4);
+  x1 = fdct_round_shift(s1 + s5);
+  x2 = fdct_round_shift(s2 + s6);
+  x3 = fdct_round_shift(s3 + s7);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
 
   // stage 2
   s0 = x0;
@@ -526,10 +534,10 @@ static void fadst8(const int16_t *input, int16_t *output) {
   x1 = s1 + s3;
   x2 = s0 - s2;
   x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -537,10 +545,10 @@ static void fadst8(const int16_t *input, int16_t *output) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
 
   output[0] =   x0;
   output[1] = - x4;
@@ -559,30 +567,34 @@ static const transform_2d FHT_8[] = {
   { fadst8, fadst8 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
-                        int stride, int tx_type) {
-  int16_t out[64];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-  const transform_2d ht = FHT_8[tx_type];
-
-  // Columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      outptr[j * 8 + i] = temp_out[j];
-  }
+void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct8x8_c(input, output, stride);
+  } else {
+    int16_t out[64];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[8], temp_out[8];
+    const transform_2d ht = FHT_8[tx_type];
+
+    // Columns
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        outptr[j * 8 + i] = temp_out[j];
+    }
 
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j + i * 8];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    // Rows
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j + i * 8];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    }
   }
 }
 
@@ -693,16 +705,16 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
     t1 = (x0 - x1) * cospi_16_64;
     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-    out[0] = dct_const_round_shift(t0);
-    out[4] = dct_const_round_shift(t2);
-    out[8] = dct_const_round_shift(t1);
-    out[12] = dct_const_round_shift(t3);
+    out[0] = fdct_round_shift(t0);
+    out[4] = fdct_round_shift(t2);
+    out[8] = fdct_round_shift(t1);
+    out[12] = fdct_round_shift(t3);
 
     // Stage 2
     t0 = (s6 - s5) * cospi_16_64;
     t1 = (s6 + s5) * cospi_16_64;
-    t2 = dct_const_round_shift(t0);
-    t3 = dct_const_round_shift(t1);
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
 
     // Stage 3
     x0 = s4 + t2;
@@ -715,21 +727,21 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-    out[2] = dct_const_round_shift(t0);
-    out[6] = dct_const_round_shift(t2);
-    out[10] = dct_const_round_shift(t1);
-    out[14] = dct_const_round_shift(t3);
+    out[2] = fdct_round_shift(t0);
+    out[6] = fdct_round_shift(t2);
+    out[10] = fdct_round_shift(t1);
+    out[14] = fdct_round_shift(t3);
   }
 
   // step 2
   temp1 = (step1[5] - step1[2]) * cospi_16_64;
   temp2 = (step1[4] - step1[3]) * cospi_16_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
   temp1 = (step1[4] + step1[3]) * cospi_16_64;
   temp2 = (step1[5] + step1[2]) * cospi_16_64;
-  step2[4] = dct_const_round_shift(temp1);
-  step2[5] = dct_const_round_shift(temp2);
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
 
   // step 3
   step3[0] = step1[0] + step2[3];
@@ -744,12 +756,12 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
   // step 4
   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
   temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
-  step2[1] = dct_const_round_shift(temp1);
-  step2[2] = dct_const_round_shift(temp2);
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
   temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-  step2[5] = dct_const_round_shift(temp1);
-  step2[6] = dct_const_round_shift(temp2);
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
 
   // step 5
   step1[0] = step3[0] + step2[1];
@@ -764,23 +776,23 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
   // step 6
   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-  out[1] = dct_const_round_shift(temp1);
-  out[9] = dct_const_round_shift(temp2);
+  out[1] = fdct_round_shift(temp1);
+  out[9] = fdct_round_shift(temp2);
 
   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-  out[5] = dct_const_round_shift(temp1);
-  out[13] = dct_const_round_shift(temp2);
+  out[5] = fdct_round_shift(temp1);
+  out[13] = fdct_round_shift(temp2);
 
   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-  out[3] = dct_const_round_shift(temp1);
-  out[11] = dct_const_round_shift(temp2);
+  out[3] = fdct_round_shift(temp1);
+  out[11] = fdct_round_shift(temp2);
 
   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-  out[7] = dct_const_round_shift(temp1);
-  out[15] = dct_const_round_shift(temp2);
+  out[7] = fdct_round_shift(temp1);
+  out[15] = fdct_round_shift(temp2);
 }
 
 static void fadst16(const int16_t *input, int16_t *output) {
@@ -821,22 +833,22 @@ static void fadst16(const int16_t *input, int16_t *output) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
+  x0 = fdct_round_shift(s0 + s8);
+  x1 = fdct_round_shift(s1 + s9);
+  x2 = fdct_round_shift(s2 + s10);
+  x3 = fdct_round_shift(s3 + s11);
+  x4 = fdct_round_shift(s4 + s12);
+  x5 = fdct_round_shift(s5 + s13);
+  x6 = fdct_round_shift(s6 + s14);
+  x7 = fdct_round_shift(s7 + s15);
+  x8  = fdct_round_shift(s0 - s8);
+  x9  = fdct_round_shift(s1 - s9);
+  x10 = fdct_round_shift(s2 - s10);
+  x11 = fdct_round_shift(s3 - s11);
+  x12 = fdct_round_shift(s4 - s12);
+  x13 = fdct_round_shift(s5 - s13);
+  x14 = fdct_round_shift(s6 - s14);
+  x15 = fdct_round_shift(s7 - s15);
 
   // stage 2
   s0 = x0;
@@ -864,14 +876,14 @@ static void fadst16(const int16_t *input, int16_t *output) {
   x5 = s1 - s5;
   x6 = s2 - s6;
   x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
+  x8 = fdct_round_shift(s8 + s12);
+  x9 = fdct_round_shift(s9 + s13);
+  x10 = fdct_round_shift(s10 + s14);
+  x11 = fdct_round_shift(s11 + s15);
+  x12 = fdct_round_shift(s8 - s12);
+  x13 = fdct_round_shift(s9 - s13);
+  x14 = fdct_round_shift(s10 - s14);
+  x15 = fdct_round_shift(s11 - s15);
 
   // stage 3
   s0 = x0;
@@ -895,18 +907,18 @@ static void fadst16(const int16_t *input, int16_t *output) {
   x1 = s1 + s3;
   x2 = s0 - s2;
   x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
   x8 = s8 + s10;
   x9 = s9 + s11;
   x10 = s8 - s10;
   x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
+  x12 = fdct_round_shift(s12 + s14);
+  x13 = fdct_round_shift(s13 + s15);
+  x14 = fdct_round_shift(s12 - s14);
+  x15 = fdct_round_shift(s13 - s15);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -918,14 +930,14 @@ static void fadst16(const int16_t *input, int16_t *output) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+  x10 = fdct_round_shift(s10);
+  x11 = fdct_round_shift(s11);
+  x14 = fdct_round_shift(s14);
+  x15 = fdct_round_shift(s15);
 
   output[0] = x0;
   output[1] = - x8;
@@ -952,31 +964,34 @@ static const transform_2d FHT_16[] = {
   { fadst16, fadst16 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
-                          int stride, int tx_type) {
-  int16_t out[256];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-  const transform_2d ht = FHT_16[tx_type];
-
-  // Columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-//      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
+void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+                    int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct16x16_c(input, output, stride);
+  } else {
+    int16_t out[256];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[16], temp_out[16];
+    const transform_2d ht = FHT_16[tx_type];
+
+    // Columns
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+    }
 
-  // Rows
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j + i * 16];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      output[j + i * 16] = temp_out[j];
+    // Rows
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j + i * 16];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        output[j + i * 16] = temp_out[j];
+    }
   }
 }
 
@@ -991,7 +1006,7 @@ static INLINE int half_round_shift(int input) {
   return rv;
 }
 
-static void dct32_1d(const int *input, int *output, int round) {
+static void fdct32(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
@@ -1323,7 +1338,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
@@ -1333,13 +1348,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 }
 
-// Note that although we use dct_32_round in dct32_1d computation flow,
+// Note that although we use dct_32_round in the fdct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
@@ -1351,7 +1366,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1364,32 +1379,8 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 1);
+    fdct32(temp_in, temp_out, 1);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = temp_out[j];
   }
 }
-
-void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct4x4(input, output, stride);
-  else
-    vp9_short_fht4x4(input, output, stride, tx_type);
-}
-
-void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct8x8(input, output, stride);
-  else
-    vp9_short_fht8x8(input, output, stride, tx_type);
-}
-
-void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                  int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct16x16(input, output, stride);
-  else
-    vp9_short_fht16x16(input, output, stride, tx_type);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h
deleted file mode 100644
index aaf976d93c5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_DCT_H_
-#define VP9_ENCODER_VP9_DCT_H_
-
-void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride);
-
-void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride);
-
-void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                  int stride);
-
-#endif  // VP9_ENCODER_VP9_DCT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 44ade18de39..86e59863bb2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -20,8 +20,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_pred_common.h"
@@ -29,72 +27,38 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
 #include "vp9/encoder/vp9_encodeframe.h"
-#include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_vaq.h"
-
-
-#define DBG_PRNT_SEGMAP 0
-
-
-// #define ENC_DEBUG
-#ifdef ENC_DEBUG
-int enc_debug = 0;
-#endif
 
-static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
-  switch (subsize) {
-    case BLOCK_64X64:
-    case BLOCK_64X32:
-    case BLOCK_32X64:
-    case BLOCK_32X32:
-      return &xd->sb_index;
-    case BLOCK_32X16:
-    case BLOCK_16X32:
-    case BLOCK_16X16:
-      return &xd->mb_index;
-    case BLOCK_16X8:
-    case BLOCK_8X16:
-    case BLOCK_8X8:
-      return &xd->b_index;
-    case BLOCK_8X4:
-    case BLOCK_4X8:
-    case BLOCK_4X4:
-      return &xd->ab_index;
-    default:
-      assert(0);
-      return NULL;
-  }
-}
+#define GF_ZEROMV_ZBIN_BOOST 0
+#define LF_ZEROMV_ZBIN_BOOST 0
+#define MV_ZBIN_BOOST        0
+#define SPLIT_MV_ZBIN_BOOST  0
+#define INTRA_ZBIN_BOOST     0
 
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize);
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              PICK_MODE_CONTEXT *ctx);
 
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
+// Motion vector component magnitude threshold for defining fast motion.
+#define FAST_MOTION_MV_THRESH 24
 
-/* activity_avg must be positive, or flat regions could get a zero weight
- *  (infinite lambda), which confounds analysis.
- * This also avoids the need for divide by zero checks in
- *  vp9_activity_masking().
- */
-#define ACTIVITY_AVG_MIN (64)
-
-/* Motion vector component magnitude threshold for defining fast motion. */
-#define FAST_MOTION_MV_THRESH (24)
-
-/* This is used as a reference when computing the source variance for the
- *  purposes of activity masking.
- * Eventually this should be replaced by custom no-reference routines,
- *  which will be faster.
- */
+// This is used as a reference when computing the source variance for the
+//  purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+//  which will be faster.
 static const uint8_t VP9_VAR_OFFS[64] = {
   128, 128, 128, 128, 128, 128, 128, 128,
   128, 128, 128, 128, 128, 128, 128, 128,
@@ -106,294 +70,573 @@ static const uint8_t VP9_VAR_OFFS[64] = {
   128, 128, 128, 128, 128, 128, 128, 128
 };
 
-static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
-                                              BLOCK_SIZE bs) {
-  unsigned int var, sse;
-  var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
-                           x->plane[0].src.stride,
-                           VP9_VAR_OFFS, 0, &sse);
-  return (var + (1 << (num_pels_log2_lookup[bs] - 1))) >>
-      num_pels_log2_lookup[bs];
+// Thin wrapper around variance() for an 8x8 block: forwards src/ref and their
+// strides with a fixed width and height of 8. Results are returned through
+// the sse and sum pointers (presumably the sum of squared differences and the
+// sum of differences -- confirm against variance()).
+static void get_sse_sum_8x8(const uint8_t *src, int src_stride,
+                            const uint8_t *ref, int ref_stride,
+                            unsigned int *sse, int *sum) {
+  variance(src, src_stride, ref, ref_stride, 8, 8, sse, sum);
 }
 
-// Original activity measure from Tim T's code.
-static unsigned int tt_activity_measure(MACROBLOCK *x) {
-  unsigned int act;
+// Thin wrapper around variance() for a 16x16 block: forwards src/ref and
+// their strides with a fixed width and height of 16. Results are returned
+// through the sse and sum pointers (presumably the sum of squared differences
+// and the sum of differences -- confirm against variance()).
+static void get_sse_sum_16x16(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse, int *sum) {
+  variance(src, src_stride, ref, ref_stride, 16, 16, sse, sum);
+}
+
+// Runs the block-size specific variance function on `ref` against the
+// constant VP9_VAR_OFFS reference (stride 0) and scales the result down, with
+// rounding, by num_pels_log2_lookup[bs] -- presumably yielding a per-pixel
+// variance for a block of size bs.
+static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
+                                              const struct buf_2d *ref,
+                                              BLOCK_SIZE bs) {
   unsigned int sse;
-  /* TODO: This could also be done over smaller areas (8x8), but that would
-   *  require extensive changes elsewhere, as lambda is assumed to be fixed
-   *  over an entire MB in most of the code.
-   * Another option is to compute four 8x8 variances, and pick a single
-   *  lambda using a non-linear combination (e.g., the smallest, or second
-   *  smallest, etc.).
-   */
-  act = vp9_variance16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                          VP9_VAR_OFFS, 0, &sse);
-  act <<= 4;
+  const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                                              VP9_VAR_OFFS, 0, &sse);
+  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
 
-  /* If the region is flat, lower the activity some more. */
-  if (act < 8 << 12)
-    act = act < 5 << 12 ? act : 5 << 12;
+// Variance of block `ref` measured against the co-located luma block of the
+// LAST_FRAME reference buffer, scaled down (with rounding) by
+// num_pels_log2_lookup[bs].
+static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
+                                                   const struct buf_2d *ref,
+                                                   int mi_row, int mi_col,
+                                                   BLOCK_SIZE bs) {
+  const YV12_BUFFER_CONFIG *const last_buf =
+      get_ref_frame_buffer(cpi, LAST_FRAME);
+  // Luma position of (mi_row, mi_col) inside the last frame.
+  const uint8_t *const last_y =
+      last_buf->y_buffer +
+      (mi_row * MI_SIZE * last_buf->y_stride + mi_col * MI_SIZE);
+  unsigned int sse;
+  const unsigned int diff_var =
+      cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                         last_y, last_buf->y_stride, &sse);
+  return ROUND_POWER_OF_TWO(diff_var, num_pels_log2_lookup[bs]);
+}
 
-  return act;
+// Picks a fixed partition size for the 64x64 block at (mi_row, mi_col) from
+// its variance against the last frame: the larger the variance, the smaller
+// the block size returned.
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
+                                                   int mi_row,
+                                                   int mi_col) {
+  const unsigned int diff_var =
+      get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+                                     mi_row, mi_col, BLOCK_64X64);
+  if (diff_var >= 2048)
+    return BLOCK_8X8;
+  if (diff_var >= 128)
+    return BLOCK_16X16;
+  if (diff_var >= 8)
+    return BLOCK_32X32;
+  return BLOCK_64X64;
 }
 
-// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) {
-  return vp9_encode_intra(x, use_dc_pred);
+// Non-RD counterpart of the variance based fixed partition choice: maps the
+// variance of the 64x64 block at (mi_row, mi_col) against the last frame to
+// BLOCK_64X64, BLOCK_32X32 or BLOCK_16X16 (never smaller).
+static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
+                                                      int mi_row,
+                                                      int mi_col) {
+  const unsigned int diff_var =
+      get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+                                     mi_row, mi_col, BLOCK_64X64);
+  if (diff_var >= 10)
+    return BLOCK_16X16;
+  if (diff_var >= 4)
+    return BLOCK_32X32;
+  return BLOCK_64X64;
 }
 
-// Measure the activity of the current macroblock
-// What we measure here is TBD so abstracted to this function
-#define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) {
-  unsigned int mb_activity;
+// Lighter version of set_offsets that only sets the mode info
+// pointers: xd->mi is aimed at the visible mode-info grid entry for
+// (mi_row, mi_col), and that entry is then bound to the matching element of
+// the cm->mi array.
+static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
+                                        MACROBLOCKD *const xd,
+                                        int mi_row,
+                                        int mi_col) {
+  // Linear index of (mi_row, mi_col), using the stride stored in xd.
+  const int idx_str = xd->mi_stride * mi_row + mi_col;
+  xd->mi = cm->mi_grid_visible + idx_str;
+  xd->mi[0] = cm->mi + idx_str;
+}
 
-  if (ALT_ACT_MEASURE) {
-    int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+// Looks the block at (mi_row, mi_col) up in cpi->active_map, which is indexed
+// in 16x16 macroblock units (mi coordinates >> 1). Blocks up to 16x16 return
+// the map entry of their macroblock directly; larger blocks return 1 if any
+// covered macroblock inside the frame has a non-zero entry, otherwise 0.
+static int is_block_in_mb_map(const VP9_COMP *cpi, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int frame_mb_rows = cm->mb_rows;
+  const int frame_mb_cols = cm->mb_cols;
+  const int first_row = mi_row >> 1;
+  const int first_col = mi_col >> 1;
+  const int cols_in_block = num_8x8_blocks_wide_lookup[bsize] >> 1;
+  const int rows_in_block = num_8x8_blocks_high_lookup[bsize] >> 1;
+  int row, col;
+
+  if (bsize <= BLOCK_16X16)
+    return cpi->active_map[first_row * frame_mb_cols + first_col];
+
+  // Rows and columns only grow, so the scan can stop at the frame edge.
+  for (row = first_row; row < first_row + rows_in_block; ++row) {
+    if (row >= frame_mb_rows)
+      break;
+    for (col = first_col; col < first_col + cols_in_block; ++col) {
+      if (col >= frame_mb_cols)
+        break;
+      if (cpi->active_map[row * frame_mb_cols + col])
+        return 1;
+    }
+  }
+  return 0;
+}
 
-    // Or use and alternative.
-    mb_activity = alt_activity_measure(x, use_dc_pred);
+// Returns the active-map lookup result for the block when the active map is
+// enabled and the macroblock is not in lossless mode; in every other case the
+// block is treated as active and 1 is returned.
+static int check_active_map(const VP9_COMP *cpi, const MACROBLOCK *x,
+                            int mi_row, int mi_col,
+                            BLOCK_SIZE bsize) {
+  if (cpi->active_map_enabled && !x->e_mbd.lossless) {
+    return is_block_in_mb_map(cpi, mi_row, mi_col, bsize);
   } else {
-    // Original activity measure from Tim T's code.
-    mb_activity = tt_activity_measure(x);
+    return 1;
   }
-
-  if (mb_activity < ACTIVITY_AVG_MIN)
-    mb_activity = ACTIVITY_AVG_MIN;
-
-  return mb_activity;
 }
 
-// Calculate an "average" mb activity value for the frame
-#define ACT_MEDIAN 0
-static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
-#if ACT_MEDIAN
-  // Find median: Simple n^2 algorithm for experimentation
-  {
-    unsigned int median;
-    unsigned int i, j;
-    unsigned int *sortlist;
-    unsigned int tmp;
-
-    // Create a list to sort to
-    CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int),
-                    cpi->common.MBs));
-
-    // Copy map to sort list
-    vpx_memcpy(sortlist, cpi->mb_activity_map,
-        sizeof(unsigned int) * cpi->common.MBs);
-
-    // Ripple each value down to its correct position
-    for (i = 1; i < cpi->common.MBs; i ++) {
-      for (j = i; j > 0; j --) {
-        if (sortlist[j] < sortlist[j - 1]) {
-          // Swap values
-          tmp = sortlist[j - 1];
-          sortlist[j - 1] = sortlist[j];
-          sortlist[j] = tmp;
-        } else {
-          break;
-        }
-      }
-    }
-
-    // Even number MBs so estimate median as mean of two either side.
-    median = (1 + sortlist[cpi->common.MBs >> 1] +
-        sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
-
-    cpi->activity_avg = median;
-
-    vpx_free(sortlist);
-  }
-#else
-  // Simple mean for now
-  cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs);
-#endif  // ACT_MEDIAN
+// Prepares cpi->mb (and its embedded MACROBLOCKD) for the bsize block at
+// (mi_row, mi_col): skip context, active-map flag, mode-info pointers,
+// destination and source plane pointers, motion vector limits, frame-edge
+// distances, RD constants, and finally the segment id and encode breakout.
+static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
+                        int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const struct segmentation *const seg = &cm->seg;
 
-  if (cpi->activity_avg < ACTIVITY_AVG_MIN)
-    cpi->activity_avg = ACTIVITY_AVG_MIN;
+  set_skip_context(xd, mi_row, mi_col);
 
-  // Experimental code: return fixed value normalized for several clips
-  if (ALT_ACT_MEASURE)
-    cpi->activity_avg = 100000;
-}
+  // Activity map pointer
+  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
 
-#define USE_ACT_INDEX   0
-#define OUTPUT_NORM_ACT_STATS   0
+  set_modeinfo_offsets(cm, xd, mi_row, mi_col);
 
-#if USE_ACT_INDEX
-// Calculate an activity index for each mb
-static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mb_row, mb_col;
+  mbmi = &xd->mi[0]->mbmi;
 
-  int64_t act;
-  int64_t a;
-  int64_t b;
+  // Set up destination pointers.
+  vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
 
-#if OUTPUT_NORM_ACT_STATS
-  FILE *f = fopen("norm_act.stt", "a");
-  fprintf(f, "\n%12d\n", cpi->activity_avg);
-#endif
-
-  // Reset pointers to start of activity map
-  x->mb_activity_ptr = cpi->mb_activity_map;
+  // Set up limit values for MV components.
+  // Mv beyond the range do not produce new/different prediction block.
+  x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND;
+  x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND;
 
-  // Calculate normalized mb activity number.
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      // Read activity from the map
-      act = *(x->mb_activity_ptr);
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_rows, cm->mi_cols);
 
-      // Calculate a normalized activity number
-      a = act + 4 * cpi->activity_avg;
-      b = 4 * act + cpi->activity_avg;
+  // Set up source buffers.
+  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
 
-      if (b >= a)
-      *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
-      else
-      *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+  // R/D setup.
+  x->rddiv = cpi->rd.RDDIV;
+  x->rdmult = cpi->rd.RDMULT;
 
-#if OUTPUT_NORM_ACT_STATS
-      fprintf(f, " %6d", *(x->mb_activity_ptr));
-#endif
-      // Increment activity map pointers
-      x->mb_activity_ptr++;
+  // Setup segment ID.
+  if (seg->enabled) {
+    if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
+    vp9_init_plane_quantizers(cpi, x);
 
-#if OUTPUT_NORM_ACT_STATS
-    fprintf(f, "\n");
-#endif
+    x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
+  } else {
+    mbmi->segment_id = 0;
+    x->encode_breakout = cpi->encode_breakout;
   }
-
-#if OUTPUT_NORM_ACT_STATS
-  fclose(f);
-#endif
 }
-#endif  // USE_ACT_INDEX
 
-// Loop through all MBs. Note activity of each, average activity and
-// calculate a normalized activity for each
-static void build_activity_map(VP9_COMP *cpi) {
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON * const cm = &cpi->common;
-
-#if ALT_ACT_MEASURE
-  YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm);
-  int recon_yoffset;
-  int recon_y_stride = new_yv12->y_stride;
-#endif
+// Makes every mode-info grid entry covered by the bsize block at
+// (mi_row, mi_col) point at the block's first entry, xd->mi[0]. Entries that
+// fall outside the frame (beyond cm->mi_rows / cm->mi_cols) are left alone.
+static void duplicate_mode_info_in_sb(VP9_COMMON * const cm,
+                                     MACROBLOCKD *const xd,
+                                     int mi_row,
+                                     int mi_col,
+                                     BLOCK_SIZE bsize) {
+  const int cols = num_8x8_blocks_wide_lookup[bsize];
+  const int rows = num_8x8_blocks_high_lookup[bsize];
+  int r, c;
+  for (r = 0; r < rows; ++r) {
+    // Rows only grow, so nothing below the frame edge needs visiting.
+    if (mi_row + r >= cm->mi_rows)
+      break;
+    for (c = 0; c < cols; ++c) {
+      if (mi_col + c >= cm->mi_cols)
+        break;
+      xd->mi[r * xd->mi_stride + c] = xd->mi[0];
+    }
+  }
+}
 
-  int mb_row, mb_col;
-  unsigned int mb_activity;
-  int64_t activity_sum = 0;
+// Records bsize as the block size at (mi_row, mi_col) and propagates the mode
+// info pointer over the area the block covers. Positions outside the frame
+// are ignored.
+static void set_block_size(VP9_COMP * const cpi,
+                           int mi_row, int mi_col,
+                           BLOCK_SIZE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd;
+
+  if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows)
+    return;
+
+  xd = &cpi->mb.e_mbd;
+  set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+  xd->mi[0]->mbmi.sb_type = bsize;
+  duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+}
 
-  x->mb_activity_ptr = cpi->mb_activity_map;
+// Accumulated statistics for one region: running sums, the sample count they
+// were taken over, and the variance value derived from them.
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int count;
+  int variance;
+} var;
+
+// Statistics of one block for each way of partitioning it: unsplit (none),
+// as two horz parts, and as two vert parts.
+typedef struct {
+  var none;
+  var horz[2];
+  var vert[2];
+} partition_variance;
+
+// Tree node for an 8x8 block; its four children are plain `var` leaves.
+typedef struct {
+  partition_variance part_variances;
+  var split[4];
+} v8x8;
+
+// Tree node for a 16x16 block with four 8x8 children.
+typedef struct {
+  partition_variance part_variances;
+  v8x8 split[4];
+} v16x16;
+
+// Tree node for a 32x32 block with four 16x16 children.
+typedef struct {
+  partition_variance part_variances;
+  v16x16 split[4];
+} v32x32;
+
+// Root of the tree: a 64x64 block with four 32x32 children.
+typedef struct {
+  partition_variance part_variances;
+  v32x32 split[4];
+} v64x64;
+
+// Size-independent view of one tree level: pointers to the level's own
+// partition_variance and to the `var` of each of its four children.
+typedef struct {
+  partition_variance *part_variances;
+  var *split[4];
+} variance_node;
+
+// Names for the tree levels.
+typedef enum {
+  V16X16,
+  V32X32,
+  V64X64,
+} TREE_LEVEL;
+
+// Exposes one level of the variance tree through a uniform view. `data` must
+// point at the v64x64 / v32x32 / v16x16 / v8x8 matching bsize; node receives
+// pointers to that level's partition_variance and to the `none` variance of
+// each of its four children (for BLOCK_8X8 the children are plain `var`s).
+// Any other bsize trips an assert.
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+  int child;
+  switch (bsize) {
+    case BLOCK_64X64: {
+      v64x64 *const tree = (v64x64 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      v32x32 *const tree = (v32x32 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      v16x16 *const tree = (v16x16 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      v8x8 *const tree = (v8x8 *)data;
+      node->part_variances = &tree->part_variances;
+      for (child = 0; child < 4; ++child)
+        node->split[child] = &tree->split[child];
+      break;
+    }
+    default:
+      assert(0);
+      break;
+  }
+}
 
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-#if ALT_ACT_MEASURE
-    // reset above block coeffs
-    xd->up_available = (mb_row != 0);
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-#endif
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-#if ALT_ACT_MEASURE
-      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
-      xd->left_available = (mb_col != 0);
-      recon_yoffset += 16;
-#endif
+// Stores the sum of squared errors (s2), the sum of errors (s) and the sample
+// count (c) in *v and derives v->variance from them as
+// 256 * (s2 - s * s / c) / c. A non-positive count gives a variance of 0.
+static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->count = c;
+  if (c <= 0) {
+    v->variance = 0;
+    return;
+  }
+  v->variance = (int)(256 * (s2 - s * s / c) / c);
+}
 
-      // measure activity
-      mb_activity = mb_activity_measure(x, mb_row, mb_col);
+// Combines the statistics of two regions: the sums of squared errors, the
+// sums of errors and the counts of a and b are added and the result, with its
+// variance recomputed by fill_variance(), is stored in r.
+// File-local helper, so it gets internal linkage like its siblings.
+static void sum_2_variances(const var *a, const var *b, var *r) {
+  fill_variance(a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->count + b->count, r);
+}
 
-      // Keep frame sum
-      activity_sum += mb_activity;
+// Fills the partition variances of one tree level from its four children
+// (indexed 0..3 in raster order): horz[] pairs children 0+1 and 2+3, vert[]
+// pairs children 0+2 and 1+3, and `none` is the sum of the two vert halves.
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+  variance_node node;
+  partition_variance *pv;
+
+  tree_to_node(data, bsize, &node);
+  pv = node.part_variances;
+
+  sum_2_variances(node.split[0], node.split[1], &pv->horz[0]);
+  sum_2_variances(node.split[2], node.split[3], &pv->horz[1]);
+  sum_2_variances(node.split[0], node.split[2], &pv->vert[0]);
+  sum_2_variances(node.split[1], node.split[3], &pv->vert[1]);
+  sum_2_variances(&pv->vert[0], &pv->vert[1], &pv->none);
+}
 
-      // Store MB level activity details.
-      *x->mb_activity_ptr = mb_activity;
+// Tries to settle the partitioning of the square bsize block at
+// (mi_row, mi_col) without splitting it into four. In order, it accepts the
+// whole block, a PARTITION_VERT pair, or a PARTITION_HORZ pair when the
+// matching variances in the tree level `data` are all below
+// 25 * base_qindex and the partition fits inside the frame. Returns 1 when
+// block sizes were set, 0 when the caller has to descend to the next level.
+static int set_vt_partitioning(VP9_COMP *cpi,
+                               void *data,
+                               BLOCK_SIZE bsize,
+                               int mi_row,
+                               int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  variance_node node;
+  const int half_width = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const int half_height = num_8x8_blocks_high_lookup[bsize] / 2;
+  // TODO(debargha): Choose this more intelligently.
+  const int64_t threshold = (int64_t)25 * cm->base_qindex;
+  // More than half of the block lies inside the frame in that direction.
+  const int cols_fit = mi_col + half_width < cm->mi_cols;
+  const int rows_fit = mi_row + half_height < cm->mi_rows;
+  const partition_variance *pv;
+
+  assert(num_8x8_blocks_high_lookup[bsize] ==
+         num_8x8_blocks_wide_lookup[bsize]);
+
+  tree_to_node(data, bsize, &node);
+  pv = node.part_variances;
+
+  // Split none is available only if we have more than half a block size
+  // in width and height inside the visible image.
+  if (cols_fit && rows_fit && pv->none.variance < threshold) {
+    set_block_size(cpi, mi_row, mi_col, bsize);
+    return 1;
+  }
+
+  // Vertical split is available on all but the bottom border.
+  if (rows_fit &&
+      pv->vert[0].variance < threshold &&
+      pv->vert[1].variance < threshold) {
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+    set_block_size(cpi, mi_row, mi_col, subsize);
+    set_block_size(cpi, mi_row, mi_col + half_width, subsize);
+    return 1;
+  }
+
+  // Horizontal split is available on all but the right border.
+  if (cols_fit &&
+      pv->horz[0].variance < threshold &&
+      pv->horz[1].variance < threshold) {
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+    set_block_size(cpi, mi_row, mi_col, subsize);
+    set_block_size(cpi, mi_row + half_height, mi_col, subsize);
+    return 1;
+  }
+
+  return 0;
+}
 
-      // Increment activity map pointer
-      x->mb_activity_ptr++;
+// TODO(debargha): Fix this function and make it work as expected.
+// Chooses block sizes for the 64x64 superblock at (mi_row, mi_col) from a
+// tree of variances. 8x8 sse/sum values are measured between the source and
+// either a LAST_FRAME inter prediction built with the nearest mv (non-key
+// frames) or the flat VP9_VAR_OFFS block (key frames); 8x8 blocks outside the
+// visible frame contribute zero. The tree is then summed bottom-up and walked
+// top-down with set_vt_partitioning(), falling back to 16x16 blocks (or 8x8
+// blocks where the 16x16 block does not fit inside the frame).
+static void choose_partitioning(VP9_COMP *cpi,
+                                const TileInfo *const tile,
+                                int mi_row, int mi_col) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
-      // adjust to the next column of source macroblocks
-      x->plane[0].src.buf += 16;
+  int i, j, k;
+  v64x64 vt;
+  uint8_t *s;
+  const uint8_t *d;
+  int sp;
+  int dp;
+  int pixels_wide = 64, pixels_high = 64;
+  int_mv nearest_mv, near_mv;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+
+  vp9_zero(vt);
+  set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+
+  if (xd->mb_to_right_edge < 0)
+    pixels_wide += (xd->mb_to_right_edge >> 3);
+  if (xd->mb_to_bottom_edge < 0)
+    pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  if (cm->frame_type != KEY_FRAME) {
+    vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
+
+    xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+    xd->mi[0]->mbmi.sb_type = BLOCK_64X64;
+    vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
+                          xd->mi[0]->mbmi.ref_mvs[LAST_FRAME],
+                          &nearest_mv, &near_mv);
+
+    xd->mi[0]->mbmi.mv[0] = nearest_mv;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
+
+    d = xd->plane[0].dst.buf;
+    dp = xd->plane[0].dst.stride;
+  } else {
+    d = VP9_VAR_OFFS;
+    dp = 0;
+  }
+
+  // Fill in the entire tree of 8x8 variances for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    for (j = 0; j < 4; j++) {
+      const int x16_idx = x32_idx + ((j & 1) << 4);
+      const int y16_idx = y32_idx + ((j >> 1) << 4);
+      v16x16 *vst = &vt.split[i].split[j];
+      for (k = 0; k < 4; k++) {
+        int x_idx = x16_idx + ((k & 1) << 3);
+        int y_idx = y16_idx + ((k >> 1) << 3);
+        unsigned int sse = 0;
+        int sum = 0;
+        if (x_idx < pixels_wide && y_idx < pixels_high)
+          get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
+                          d + y_idx * dp + x_idx, dp, &sse, &sum);
+        fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
+      }
     }
-
-    // adjust to the next row of mbs
-    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
   }
-
-  // Calculate an "average" MB activity
-  calc_av_activity(cpi, activity_sum);
-
-#if USE_ACT_INDEX
-  // Calculate an activity index number of each mb
-  calc_activity_index(cpi, x);
-#endif
-}
-
-// Macroblock activity masking
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
-  x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
-  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
-  x->errorperbit += (x->errorperbit == 0);
+  // Fill the rest of the variance tree by summing split partition values.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+    }
+    fill_variance_tree(&vt.split[i], BLOCK_32X32);
+  }
+  fill_variance_tree(&vt, BLOCK_64X64);
+
+  // Now go through the entire structure,  splitting every block size until
+  // we get to one that's got a variance lower than our threshold,  or we
+  // hit 8x8.
+  if (!set_vt_partitioning(cpi, &vt, BLOCK_64X64,
+                           mi_row, mi_col)) {
+    for (i = 0; i < 4; ++i) {
+      const int x32_idx = ((i & 1) << 2);
+      const int y32_idx = ((i >> 1) << 2);
+      if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32,
+                               (mi_row + y32_idx), (mi_col + x32_idx))) {
+        for (j = 0; j < 4; ++j) {
+          const int x16_idx = ((j & 1) << 1);
+          const int y16_idx = ((j >> 1) << 1);
+          // NOTE: This is a temporary hack to disable 8x8 partitions,
+          // since it works really badly - possibly due to a bug
+#define DISABLE_8X8_VAR_BASED_PARTITION
+#ifdef DISABLE_8X8_VAR_BASED_PARTITION
+          if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows &&
+              mi_col + x32_idx + x16_idx + 1 < cm->mi_cols) {
+            set_block_size(cpi,
+                           (mi_row + y32_idx + y16_idx),
+                           (mi_col + x32_idx + x16_idx),
+                           BLOCK_16X16);
+          } else {
+            for (k = 0; k < 4; ++k) {
+              const int x8_idx = (k & 1);
+              const int y8_idx = (k >> 1);
+              set_block_size(cpi,
+                             (mi_row + y32_idx + y16_idx + y8_idx),
+                             (mi_col + x32_idx + x16_idx + x8_idx),
+                             BLOCK_8X8);
+            }
+          }
 #else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
-
-  // Apply the masking to the RD multiplier.
-  a = act + (2 * cpi->activity_avg);
-  b = (2 * act) + cpi->activity_avg;
-
-  x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a);
-  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
-  x->errorperbit += (x->errorperbit == 0);
+          if (!set_vt_partitioning(cpi, &vt.split[i].split[j],
+                                   BLOCK_16X16,
+                                   (mi_row + y32_idx + y16_idx),
+                                   (mi_col + x32_idx + x16_idx))) {
+            for (k = 0; k < 4; ++k) {
+              const int x8_idx = (k & 1);
+              const int y8_idx = (k >> 1);
+              set_block_size(cpi,
+                             (mi_row + y32_idx + y16_idx + y8_idx),
+                             (mi_col + x32_idx + x16_idx + x8_idx),
+                             BLOCK_8X8);
+            }
+          }
 #endif
+        }
+      }
+    }
+  }
+}
 
-  // Activity based Zbin adjustment
-  adjust_act_zbin(cpi, x);
+// Original activity measure from Tim T's code: the 16x16 variance of the
+// source luma against the flat VP9_VAR_OFFS block, shifted left by 4.
+static unsigned int tt_activity_measure(MACROBLOCK *x) {
+  unsigned int sse;
+  // TODO: This could also be done over smaller areas (8x8), but that would
+  // require extensive changes elsewhere, as lambda is assumed to be fixed
+  // over an entire MB in most of the code.
+  // Another option is to compute four 8x8 variances, and pick a single
+  // lambda using a non-linear combination (e.g., the smallest, or second
+  // smallest, etc.).
+  unsigned int act = vp9_variance16x16(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       VP9_VAR_OFFS, 0, &sse) << 4;
+  // If the region is flat, lower the activity some more: values between
+  // 5 << 12 and 8 << 12 are clamped down to 5 << 12.
+  if (act < (8 << 12) && act > (5 << 12))
+    act = 5 << 12;
+  return act;
 }
 
 static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
-                         BLOCK_SIZE bsize, int output_enabled) {
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         int output_enabled) {
   int i, x_idx, y;
   VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  MODE_INFO *mi_addr = xd->mi_8x8[0];
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi_addr = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
 
-  int mb_mode_index = ctx->best_mode_index;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int max_plane;
 
-  assert(mi->mbmi.mode < MB_MODE_COUNT);
-  assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
-  assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
   assert(mi->mbmi.sb_type == bsize);
 
   *mi_addr = *mi;
 
+  // If segmentation in use
+  if (seg->enabled && output_enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi_addr->mbmi.segment_id =
+        vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    // Else for cyclic refresh mode update the segment map, set the segment id
+    // and then update the quantizer.
+    else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
+                                        mi_row, mi_col, bsize, 1);
+      vp9_init_plane_quantizers(cpi, x);
+    }
+  }
+
+  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+  for (i = 0; i < max_plane; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
+  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+    p[i].eobs = ctx->eobs_pbuf[i][2];
+  }
+
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
     for (x_idx = 0; x_idx < mi_width; x_idx++)
       if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
-          && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
-        xd->mi_8x8[x_idx + y * mis] = mi_addr;
+        && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
 
-  if (cpi->sf.variance_adaptive_quantization) {
-    vp9_mb_init_quantizer(cpi, x);
-  }
+  if (cpi->oxcf.aq_mode)
+    vp9_init_plane_quantizers(cpi, x);
 
   // FIXME(rbultje) I'm pretty sure this should go to the end of this block
   // (i.e. after the output_enabled)
@@ -417,56 +660,45 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 
   if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     for (i = 0; i < TX_MODES; i++)
-      cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i];
+      rd_opt->tx_select_diff[i] += ctx->tx_rd_diff[i];
   }
 
-  if (frame_is_intra_only(cm)) {
 #if CONFIG_INTERNAL_STATS
+  if (frame_is_intra_only(cm)) {
     static const int kf_mode_index[] = {
-      THR_DC /*DC_PRED*/,
-      THR_V_PRED /*V_PRED*/,
-      THR_H_PRED /*H_PRED*/,
-      THR_D45_PRED /*D45_PRED*/,
+      THR_DC        /*DC_PRED*/,
+      THR_V_PRED    /*V_PRED*/,
+      THR_H_PRED    /*H_PRED*/,
+      THR_D45_PRED  /*D45_PRED*/,
       THR_D135_PRED /*D135_PRED*/,
       THR_D117_PRED /*D117_PRED*/,
       THR_D153_PRED /*D153_PRED*/,
       THR_D207_PRED /*D207_PRED*/,
-      THR_D63_PRED /*D63_PRED*/,
-      THR_TM /*TM_PRED*/,
+      THR_D63_PRED  /*D63_PRED*/,
+      THR_TM        /*TM_PRED*/,
     };
-    cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++;
-#endif
+    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
   } else {
     // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
-    if (is_inter_block(mbmi)
-        && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
-      int_mv best_mv[2];
-      const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
-      best_mv[0].as_int = ctx->best_ref_mv.as_int;
-      best_mv[1].as_int = ctx->second_best_ref_mv.as_int;
-      if (mbmi->mode == NEWMV) {
-        best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int;
-        if (rf2 > 0)
-          best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int;
-      }
-      mbmi->best_mv[0].as_int = best_mv[0].as_int;
-      mbmi->best_mv[1].as_int = best_mv[1].as_int;
-      vp9_update_mv_count(cpi, x, best_mv);
-    }
+    ++cpi->mode_chosen_counts[ctx->best_mode_index];
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(mbmi)) {
+      vp9_update_mv_count(cm, xd);
 
-    if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) {
-      const int ctx = vp9_get_pred_context_switchable_interp(xd);
-      ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+      if (cm->interp_filter == SWITCHABLE) {
+        const int ctx = vp9_get_pred_context_switchable_interp(xd);
+        ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+      }
     }
 
-    cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
+    rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rd_opt->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rd_opt->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-      cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      rd_opt->filter_diff[i] += ctx->best_filter_diff[i];
   }
 }
 
@@ -478,119 +710,43 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           src->alpha_stride};
   int i;
 
+  // Set current frame pointer.
+  x->e_mbd.cur_buf = src;
+
   for (i = 0; i < MAX_MB_PLANE; i++)
     setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
                      NULL, x->e_mbd.plane[i].subsampling_x,
                      x->e_mbd.plane[i].subsampling_y);
 }
 
-static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
-                        int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
-  const int dst_fb_idx = cm->new_fb_idx;
-  const int idx_str = xd->mode_info_stride * mi_row + mi_col;
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mb_row = mi_row >> 1;
-  const int mb_col = mi_col >> 1;
-  const int idx_map = mb_row * cm->mb_cols + mb_col;
-  const struct segmentation *const seg = &cm->seg;
-
-  set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col);
-
-  // Activity map pointer
-  x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
-  x->active_ptr = cpi->active_map + idx_map;
-
-  xd->mi_8x8 = cm->mi_grid_visible + idx_str;
-  xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
-
-  // Special case: if prev_mi is NULL, the previous mode info context
-  // cannot be used.
-  xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
-
-  xd->mi_8x8[0] = cm->mi + idx_str;
-
-  mbmi = &xd->mi_8x8[0]->mbmi;
-
-  // Set up destination pointers
-  setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
-
-  // Set up limit values for MV components
-  // mv beyond the range do not produce new/different prediction block
-  x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
-  x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
-  x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND;
-  x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND;
-
-  // Set up distance of MB to edge of frame in 1/8th pel units
-  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
-                 cm->mi_rows, cm->mi_cols);
-
-  /* set up source buffers */
-  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
-
-  /* R/D setup */
-  x->rddiv = cpi->RDDIV;
-  x->rdmult = cpi->RDMULT;
-
-  /* segment ID */
-  if (seg->enabled) {
-    if (!cpi->sf.variance_adaptive_quantization) {
-      uint8_t *map = seg->update_map ? cpi->segmentation_map
-          : cm->last_frame_seg_map;
-      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
-    }
-    vp9_mb_init_quantizer(cpi, x);
-
-    if (seg->enabled && cpi->seg0_cnt > 0
-        && !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME)
-        && vp9_segfeature_active(seg, 1, SEG_LVL_REF_FRAME)) {
-      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-    } else {
-      const int y = mb_row & ~3;
-      const int x = mb_col & ~3;
-      const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
-      const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
-      const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1;
-      const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1;
-
-      cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
-          << 16) / cm->MBs;
-    }
-
-    x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
-  } else {
-    mbmi->segment_id = 0;
-    x->encode_breakout = cpi->oxcf.encode_breakout;
-  }
-}
-
-static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, int mi_col,
-                          int *totalrate, int64_t *totaldist,
-                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                          int64_t best_rd) {
+static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, int mi_col,
+                             int *totalrate, int64_t *totaldist,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd, int block) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int orig_rdmult = x->rdmult;
+  MB_MODE_INFO *mbmi;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+  int i, orig_rdmult;
   double rdmult_ratio;
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   rdmult_ratio = 1.0;  // avoid uninitialized warnings
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
 
+  // TODO(JBB): Most other places in the code instead of calling the function
+  // and then checking if it's not the first 8x8 we put the check in the
+  // calling function.  Do that here.
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (xd->ab_index != 0) {
+    if (block != 0) {
       *totalrate = 0;
       *totaldist = 0;
       return;
@@ -598,32 +754,56 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   }
 
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  xd->mi_8x8[0]->mbmi.sb_type = bsize;
+  mbmi = &xd->mi[0]->mbmi;
+  mbmi->sb_type = bsize;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][0];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+    p[i].eobs = ctx->eobs_pbuf[i][0];
+  }
+  ctx->is_coded = 0;
+  x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
-  xd->mi_8x8[0]->mbmi.skip_coeff = 0;
+  mbmi->skip = 0;
 
-  x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
+  x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
 
-  if (cpi->sf.variance_adaptive_quantization) {
-    int energy;
-    if (bsize <= BLOCK_16X16) {
-      energy = x->mb_energy;
+  // Save rdmult before it might be changed, so it can be restored later.
+  orig_rdmult = x->rdmult;
+
+  if (aq_mode == VARIANCE_AQ) {
+    const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
+                                            : vp9_block_energy(cpi, x, bsize);
+    if (cm->frame_type == KEY_FRAME ||
+        cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      mbmi->segment_id = vp9_vaq_segment_id(energy);
     } else {
-      energy = vp9_block_energy(cpi, x, bsize);
+      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                    : cm->last_frame_seg_map;
+      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
 
-    xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy);
     rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
-    vp9_mb_init_quantizer(cpi, x);
-  }
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-
-  if (cpi->sf.variance_adaptive_quantization) {
-    vp9_clear_system_state();  // __asm emms;
-    x->rdmult = round(x->rdmult * rdmult_ratio);
+    vp9_init_plane_quantizers(cpi, x);
+    vp9_clear_system_state();
+    x->rdmult = (int)round(x->rdmult * rdmult_ratio);
+  } else if (aq_mode == COMPLEXITY_AQ) {
+    const int mi_offset = mi_row * cm->mi_cols + mi_col;
+    unsigned char complexity = cpi->complexity_map[mi_offset];
+    const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
+                        (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
+    if (!is_edge && (complexity > 128))
+      x->rdmult += ((x->rdmult * (complexity - 128)) / 256);
+  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    // If segment 1, use rdmult for that segment.
+    if (vp9_get_segment_id(cm, map, bsize, mi_row, mi_col))
+      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
   }
 
   // Find best coding mode & reconstruct the MB so it is available
@@ -640,70 +820,52 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
                                     totaldist, bsize, ctx, best_rd);
   }
 
-  if (cpi->sf.variance_adaptive_quantization) {
-    x->rdmult = orig_rdmult;
-    if (*totalrate != INT_MAX) {
-      vp9_clear_system_state();  // __asm emms;
-      *totalrate = round(*totalrate * rdmult_ratio);
-    }
+  x->rdmult = orig_rdmult;
+
+  if (aq_mode == VARIANCE_AQ && *totalrate != INT_MAX) {
+    vp9_clear_system_state();
+    *totalrate = (int)round(*totalrate * rdmult_ratio);
   }
 }
 
 static void update_stats(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi_8x8[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MACROBLOCK *const x = &cpi->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (!frame_is_intra_only(cm)) {
     const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                                      SEG_LVL_REF_FRAME);
+    if (!seg_ref_active) {
+      FRAME_COUNTS *const counts = &cm->counts;
+      const int inter_block = is_inter_block(mbmi);
 
-    if (!seg_ref_active)
-      cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)]
-                            [is_inter_block(mbmi)]++;
-
-    // If the segment reference feature is enabled we have only a single
-    // reference frame allowed for the segment so exclude it from
-    // the reference frame counts used to work out probabilities.
-    if (is_inter_block(mbmi) && !seg_ref_active) {
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
-        cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)]
-                             [has_second_ref(mbmi)]++;
-
-      if (has_second_ref(mbmi)) {
-        cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)]
-                           [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
-      } else {
-        cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)][0]
-                             [mbmi->ref_frame[0] != LAST_FRAME]++;
-        if (mbmi->ref_frame[0] != LAST_FRAME)
-          cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1]
-                               [mbmi->ref_frame[0] != GOLDEN_FRAME]++;
-      }
-    }
+      counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
 
-    // Count of last ref frame 0,0 usage
-    if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
-      cpi->inter_zz_count++;
-  }
-}
+      // If the segment reference feature is enabled we have only a single
+      // reference frame allowed for the segment so exclude it from
+      // the reference frame counts used to work out probabilities.
+      if (inter_block) {
+        const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
 
-static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  switch (bsize) {
-    case BLOCK_64X64:
-      return &x->sb64_partitioning;
-    case BLOCK_32X32:
-      return &x->sb_partitioning[xd->sb_index];
-    case BLOCK_16X16:
-      return &x->mb_partitioning[xd->sb_index][xd->mb_index];
-    case BLOCK_8X8:
-      return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
-    default:
-      assert(0);
-      return NULL;
+        if (cm->reference_mode == REFERENCE_MODE_SELECT)
+          counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
+                            [has_second_ref(mbmi)]++;
+        // Compound blocks update comp_ref; single-reference ones single_ref.
+        if (has_second_ref(mbmi)) {
+          counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)]
+                          [ref0 == GOLDEN_FRAME]++;
+        } else {
+          counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0]
+                            [ref0 != LAST_FRAME]++;
+          if (ref0 != LAST_FRAME)
+            counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1]
+                              [ref0 != GOLDEN_FRAME]++;
+        }
+      }
+    }
   }
 }
 
@@ -721,21 +883,21 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
   int mi_height = num_8x8_blocks_high_lookup[bsize];
   for (p = 0; p < MAX_MB_PLANE; p++) {
     vpx_memcpy(
-        cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+        xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
         a + num_4x4_blocks_wide * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     vpx_memcpy(
-        cpi->left_context[p]
+        xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         l + num_4x4_blocks_high * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  vpx_memcpy(cpi->above_seg_context + mi_col, sa,
-             sizeof(*cpi->above_seg_context) * mi_width);
-  vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl,
-             sizeof(cpi->left_seg_context[0]) * mi_height);
+  vpx_memcpy(xd->above_seg_context + mi_col, sa,
+             sizeof(*xd->above_seg_context) * mi_width);
+  vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+             sizeof(xd->left_seg_context[0]) * mi_height);
 }
 static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
@@ -754,44 +916,30 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
   for (p = 0; p < MAX_MB_PLANE; ++p) {
     vpx_memcpy(
         a + num_4x4_blocks_wide * p,
-        cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+        xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     vpx_memcpy(
         l + num_4x4_blocks_high * p,
-        cpi->left_context[p]
+        xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  vpx_memcpy(sa, cpi->above_seg_context + mi_col,
-             sizeof(*cpi->above_seg_context) * mi_width);
-  vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK),
-             sizeof(cpi->left_seg_context[0]) * mi_height);
+  vpx_memcpy(sa, xd->above_seg_context + mi_col,
+             sizeof(*xd->above_seg_context) * mi_width);
+  vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+             sizeof(xd->left_seg_context[0]) * mi_height);
 }
 
 static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
                      TOKENEXTRA **tp, int mi_row, int mi_col,
-                     int output_enabled, BLOCK_SIZE bsize, int sub_index) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
+                     int output_enabled, BLOCK_SIZE bsize,
+                     PICK_MODE_CONTEXT *ctx) {
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  if (sub_index != -1)
-    *get_sb_index(xd, bsize) = sub_index;
-
-  if (bsize < BLOCK_8X8) {
-    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
-    // there is nothing to be done.
-    if (xd->ab_index > 0)
-      return;
-  }
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
-  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
+  update_state(cpi, ctx, mi_row, mi_col, bsize, output_enabled);
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
 
   if (output_enabled) {
     update_stats(cpi);
@@ -803,68 +951,74 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
 
 static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
                       TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  BLOCK_SIZE c1 = BLOCK_8X8;
-  const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
-  int pl = 0;
+                      int output_enabled, BLOCK_SIZE bsize,
+                      PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // Recursively encodes the block at (mi_row, mi_col) following pc_tree.
+  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  int ctx;
   PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  int i;
+  BLOCK_SIZE subsize = bsize;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  c1 = BLOCK_4X4;
   if (bsize >= BLOCK_8X8) {
-    pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                                 mi_row, mi_col, bsize);
-    c1 = *(get_sb_partitioning(x, bsize));
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    subsize = get_subsize(bsize, pc_tree->partitioning);
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
   }
-  partition = partition_lookup[bsl][c1];
+  // Look up the partition type; count it only when producing final output.
+  partition = partition_lookup[bsl][subsize];
+  if (output_enabled && bsize != BLOCK_4X4)
+    cm->counts.partition[ctx][partition]++;
 
   switch (partition) {
     case PARTITION_NONE:
-      if (output_enabled && bsize >= BLOCK_8X8)
-        cpi->partition_count[pl][PARTITION_NONE]++;
-      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1);
+      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->none);
       break;
     case PARTITION_VERT:
-      if (output_enabled)
-        cpi->partition_count[pl][PARTITION_VERT]++;
-      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
-      encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->vertical[0]);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
+                 &pc_tree->vertical[1]);
+      }
       break;
     case PARTITION_HORZ:
-      if (output_enabled)
-        cpi->partition_count[pl][PARTITION_HORZ]++;
-      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
-      encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
+      encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+               &pc_tree->horizontal[0]);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
+                 &pc_tree->horizontal[1]);
+      }
       break;
     case PARTITION_SPLIT:
-      subsize = get_subsize(bsize, PARTITION_SPLIT);
-
-      if (output_enabled)
-        cpi->partition_count[pl][PARTITION_SPLIT]++;
-
-      for (i = 0; i < 4; i++) {
-        const int x_idx = i & 1, y_idx = i >> 1;
-
-        *get_sb_index(xd, subsize) = i;
-        encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
-                  output_enabled, subsize);
+      if (bsize == BLOCK_8X8) {
+        encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                 pc_tree->leaf_split[0]);
+      } else {
+        encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  pc_tree->split[0]);
+        encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
+                  pc_tree->split[1]);
+        encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
+                  pc_tree->split[2]);
+        encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+                  subsize, pc_tree->split[3]);
       }
       break;
     default:
-      assert(0);
-      break;
+      assert(0 && "Invalid partition type.");  // bare string is always true
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
-                             mi_row, mi_col, c1, bsize);
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 // Check to see if the given partition size is allowed for a specified number
@@ -873,10 +1027,10 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
 static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
                                       int rows_left, int cols_left,
                                       int *bh, int *bw) {
-  if ((rows_left <= 0) || (cols_left <= 0)) {
+  if (rows_left <= 0 || cols_left <= 0) {
     return MIN(bsize, BLOCK_8X8);
   } else {
-    for (; bsize > 0; --bsize) {
+    for (; bsize > 0; bsize -= 3) {
       *bh = num_8x8_blocks_high_lookup[bsize];
       *bw = num_8x8_blocks_wide_lookup[bsize];
       if ((*bh <= rows_left) && (*bw <= cols_left)) {
@@ -887,20 +1041,36 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
   return bsize;
 }
 
+static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
+    int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
+    BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
+  int bh = bh_in;  // row step; may be rewritten by find_partition_size()
+  int r, c;
+  for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
+    int bw = bw_in;  // column step; reset to bw_in at the start of each row
+    for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
+      const int index = r * mis + c;
+      mi_8x8[index] = mi + index;
+      mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize,
+          row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
+    }
+  }
+}
+
 // This function attempts to set all mode info entries in a given SB64
 // to the same block partition size.
 // However, at the bottom and right borders of the image the requested size
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
-static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
-                             MODE_INFO **mi_8x8, int mi_row, int mi_col) {
+static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+                                   MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
-  const int mis = cm->mode_info_stride;
-  int row8x8_remaining = tile->mi_row_end - mi_row;
-  int col8x8_remaining = tile->mi_col_end - mi_col;
+  const int mis = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
   int block_row, block_col;
-  MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
   int bh = num_8x8_blocks_high_lookup[bsize];
   int bw = num_8x8_blocks_wide_lookup[bsize];
 
@@ -918,34 +1088,23 @@ static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     }
   } else {
     // Else this is a partial SB64.
-    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
-      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
-        int index = block_row * mis + block_col;
-        // Find a partition size that fits
-        bsize = find_partition_size(cpi->sf.always_this_block_size,
-                                    (row8x8_remaining - block_row),
-                                    (col8x8_remaining - block_col), &bh, &bw);
-        mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->mbmi.sb_type = bsize;
-      }
-    }
+    set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
+        col8x8_remaining, bsize, mi_8x8);
   }
 }
 
-static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
-                              MODE_INFO **prev_mi_8x8) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8,
+  MODE_INFO **prev_mi_8x8) {
+  const int mis = cm->mi_stride;
   int block_row, block_col;
 
   for (block_row = 0; block_row < 8; ++block_row) {
     for (block_col = 0; block_col < 8; ++block_col) {
-      MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
-      BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-      ptrdiff_t offset;
+      MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col];
+      const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
 
       if (prev_mi) {
-        offset = prev_mi - cm->prev_mi;
+        const ptrdiff_t offset = prev_mi - cm->prev_mi;
         mi_8x8[block_row * mis + block_col] = cm->mi + offset;
         mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type;
       }
@@ -953,15 +1112,214 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
   }
 }
 
-static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) {
+static void constrain_copy_partitioning(VP9_COMP *const cpi,
+                                        const TileInfo *const tile,
+                                        MODE_INFO **mi_8x8,
+                                        MODE_INFO **prev_mi_8x8,
+                                        int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  int block_row, block_col;
+  // Reuse prev-frame partitions no larger than bsize; otherwise force bsize.
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // If the SB64 is all "in image".
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+        const int index = block_row * mis + block_col;
+        MODE_INFO *prev_mi = prev_mi_8x8[index];
+        const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
+        // Use previous partition if block size is not larger than bsize.
+        if (prev_mi && sb_type <= bsize) {
+          int block_row2, block_col2;
+          for (block_row2 = 0; block_row2 < bh; ++block_row2) {
+            for (block_col2 = 0; block_col2 < bw; ++block_col2) {
+              const int index2 = (block_row + block_row2) * mis +
+                  block_col + block_col2;
+              prev_mi = prev_mi_8x8[index2];
+              if (prev_mi) {
+                const ptrdiff_t offset = prev_mi - cm->prev_mi;
+                mi_8x8[index2] = cm->mi + offset;
+                mi_8x8[index2]->mbmi.sb_type = prev_mi->mbmi.sb_type;
+              }
+            }
+          }
+        } else {
+          // Otherwise, use fixed partition of size bsize.
+          mi_8x8[index] = mi_upper_left + index;
+          mi_8x8[index]->mbmi.sb_type = bsize;
+        }
+      }
+    }
+  } else {
+    // Else this is a partial SB64, copy previous partition.
+    copy_partitioning(cm, mi_8x8, prev_mi_8x8);
+  }
+}
+
+// (row, col) of each 16x16 block inside an SB64, in 8x8 (mi) units.
+const struct {
+  int row;
+  int col;
+} coord_lookup[16] = {
+    // 32x32 index = 0
+    {0, 0}, {0, 2}, {2, 0}, {2, 2},
+    // 32x32 index = 1
+    {0, 4}, {0, 6}, {2, 4}, {2, 6},
+    // 32x32 index = 2
+    {4, 0}, {4, 2}, {6, 0}, {6, 2},
+    // 32x32 index = 3
+    {4, 4}, {4, 6}, {6, 4}, {6, 6},
+};
+
+static void set_source_var_based_partition(VP9_COMP *cpi,
+                                           const TileInfo *const tile,
+                                           MODE_INFO **mi_8x8,
+                                           int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  const int mis = cm->mi_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  // Chooses 16x16/32x32/64x64 blocks by thresholding per-block var values.
+  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+
+  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+  // In-image SB64
+  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+      (row8x8_remaining >= MI_BLOCK_SIZE)) {
+    const int src_stride = x->plane[0].src.stride;
+    const int pre_stride = cpi->Last_Source->y_stride;
+    const uint8_t *src = x->plane[0].src.buf;
+    const int pre_offset = (mi_row * MI_SIZE) * pre_stride +
+                           (mi_col * MI_SIZE);
+    const uint8_t *pre_src = cpi->Last_Source->y_buffer + pre_offset;
+    const unsigned int thr_32x32 = cpi->sf.source_var_thresh;
+    const unsigned int thr_64x64 = thr_32x32 << 1;
+    int i, j;
+    int index;
+    diff d32[4];
+    int use16x16 = 0;
+
+    for (i = 0; i < 4; i++) {
+      diff d16[4];
+
+      for (j = 0; j < 4; j++) {
+        int b_mi_row = coord_lookup[i * 4 + j].row;
+        int b_mi_col = coord_lookup[i * 4 + j].col;
+        int b_offset = b_mi_row * MI_SIZE * src_stride +
+                       b_mi_col * MI_SIZE;
+        // NOTE(review): b_offset uses src_stride but is also added to pre_src.
+        get_sse_sum_16x16(src + b_offset, src_stride,
+                          pre_src + b_offset, pre_stride,
+                          &d16[j].sse, &d16[j].sum);
+        // var = sse - sum^2 / 256 (a 16x16 block has 256 samples).
+        d16[j].var = d16[j].sse -
+            (((uint32_t)d16[j].sum * d16[j].sum) >> 8);
+
+        index = b_mi_row * mis + b_mi_col;
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->mbmi.sb_type = BLOCK_16X16;
+
+        // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
+        // size to further improve quality.
+      }
+
+      if (d16[0].var < thr_32x32 && d16[1].var < thr_32x32 &&
+          d16[2].var < thr_32x32 && d16[3].var < thr_32x32) {
+        d32[i].sse = d16[0].sse;
+        d32[i].sum = d16[0].sum;
+
+        for (j = 1; j < 4; j++) {
+          d32[i].sse += d16[j].sse;
+          d32[i].sum += d16[j].sum;
+        }
+        // Same formula over the 1024 samples of the merged 32x32 block.
+        d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10);
+
+        index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
+        mi_8x8[index] = mi_upper_left + index;
+        mi_8x8[index]->mbmi.sb_type = BLOCK_32X32;
+
+        if (!((cm->current_video_frame - 1) %
+            cpi->sf.search_type_check_frequency))
+          cpi->use_large_partition_rate += 1;
+      } else {
+        use16x16 = 1;
+      }
+    }
+    // All four 32x32 quadrants were merged: consider a single 64x64 block.
+    if (!use16x16) {
+      if (d32[0].var < thr_64x64 && d32[1].var < thr_64x64 &&
+          d32[2].var < thr_64x64 && d32[3].var < thr_64x64)  {
+        mi_8x8[0] = mi_upper_left;
+        mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
+      }
+    }
+  } else {   // partial in-image SB64
+    int bh = num_8x8_blocks_high_lookup[BLOCK_16X16];
+    int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16];
+    set_partial_b64x64_partition(mi_upper_left, mis, bh, bw,
+        row8x8_remaining, col8x8_remaining, BLOCK_16X16, mi_8x8);
+  }
+}
+
+static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
+                         int mi_row, int mi_col) {
+  // Flags the SB64 at (mi_row, mi_col) as static when its luma SAD against
+  // cpi->Last_Source is below 2 * threshold; also stored in x->in_static_area.
+  MACROBLOCK *x = &cpi->mb;
+  uint8_t *src, *pre;
+  int src_stride, pre_stride;
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  int this_sad = 0;
+  int threshold = 0;
+  // This assumes the input source frames are of the same dimension.
+  src_stride = cpi->Source->y_stride;
+  src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride +
+            (mi_col * MI_SIZE);
+  pre_stride = cpi->Last_Source->y_stride;
+  pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride +
+          (mi_col * MI_SIZE);
+
+  if (row8x8_remaining >= MI_BLOCK_SIZE &&
+      col8x8_remaining >= MI_BLOCK_SIZE) {
+    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
+                                            pre, pre_stride, 0x7fffffff);
+    threshold = (1 << 12);
+  } else {
+    // Partial SB64: add up the SAD of each 16x16 block starting inside it.
+    int r, c;
+    for (r = 0; r < row8x8_remaining; r += 2)
+      for (c = 0; c < col8x8_remaining; c += 2)
+        this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(
+            src + (r * src_stride + c) * MI_SIZE, src_stride,
+            pre + (r * pre_stride + c) * MI_SIZE, pre_stride, 0x7fffffff);
+    threshold = (row8x8_remaining * col8x8_remaining) << 6;
+  }
+  x->in_static_area = (this_sad < 2 * threshold);
+  return x->in_static_area;
+}
+
+static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
+  const int mis = cm->mi_stride;
   int block_row, block_col;
 
   if (cm->prev_mi) {
     for (block_row = 0; block_row < 8; ++block_row) {
       for (block_col = 0; block_col < 8; ++block_col) {
-        MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col];
+        const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col];
         if (prev_mi) {
           if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 ||
               abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8)
@@ -973,68 +1331,188 @@ static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) {
   return 0;
 }
 
+// Installs the mode decision held in ctx as the current block's mode info and
+// updates the dependent encoder state: the cyclic-refresh segment (when that
+// AQ mode is active and segmentation is enabled), the motion-vector and
+// switchable interpolation-filter counts for inter blocks, and x->skip.
+static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+                            int mi_row, int mi_col, int bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  // Copy the chosen mode info into the frame's mode-info storage.
+  *mi = ctx->mic;
+
+  // For in-frame adaptive Q, check for resetting the segment_id and updating
+  // the cyclic refresh map; the quantizers are re-initialized afterwards.
+  if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, 1);
+    vp9_init_plane_quantizers(cpi, x);
+  }
+
+  if (is_inter_block(mbmi)) {
+    vp9_update_mv_count(cm, xd);
+
+    // The filter choice is only counted when it is selected per block.
+    if (cm->interp_filter == SWITCHABLE) {
+      const int filter_ctx = vp9_get_pred_context_switchable_interp(xd);
+      cm->counts.switchable_interp[filter_ctx][mbmi->interp_filter] += 1;
+    }
+  }
+
+  x->skip = ctx->skip;
+}
+
+// Encodes one block of size bsize at (mi_row, mi_col) using the mode decision
+// stored in ctx, updates the statistics, and appends an EOSB_TOKEN to the
+// token stream, leaving *tp just past it.
+static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                        TOKENEXTRA **tp, int mi_row, int mi_col,
+                        int output_enabled, BLOCK_SIZE bsize,
+                        PICK_MODE_CONTEXT *ctx) {
+  TOKENEXTRA *end_of_sb;
+
+  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  update_state_rt(cpi, ctx, mi_row, mi_col, bsize);
+
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+  update_stats(cpi);
+
+  // Read *tp only now: it is fetched after encode_superblock() has run.
+  end_of_sb = *tp;
+  end_of_sb->token = EOSB_TOKEN;
+  *tp = end_of_sb + 1;
+}
+
+// Encodes the block at (mi_row, mi_col) of size bsize by recursing over the
+// partitioning already stored in the visible mode-info grid, handing each
+// leaf the matching PICK_MODE_CONTEXT from pc_tree.
+//   output_enabled: when non-zero, the chosen partition type is counted in
+//                   cm->counts.partition (except for BLOCK_4X4).
+// Returns immediately for blocks whose origin lies outside the frame.
+static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
+                         TOKENEXTRA **tp, int mi_row, int mi_col,
+                         int output_enabled, BLOCK_SIZE bsize,
+                         PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // hbs is the mode-info offset from the block origin to its second half.
+  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  int ctx;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize >= BLOCK_8X8) {
+    // The partitioning is derived from the sb_type stored at this position.
+    const int idx_str = xd->mi_stride * mi_row + mi_col;
+    MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    subsize = mi_8x8[0]->mbmi.sb_type;
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
+  }
+
+  partition = partition_lookup[bsl][subsize];
+  if (output_enabled && bsize != BLOCK_4X4)
+    cm->counts.partition[ctx][partition]++;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  &pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  &pc_tree->vertical[0]);
+      // The right half is encoded only when it starts inside the frame and
+      // bsize is larger than BLOCK_8X8.
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                    subsize, &pc_tree->vertical[1]);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                  &pc_tree->horizontal[0]);
+      // Same rule for the bottom half.
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                    subsize, &pc_tree->horizontal[1]);
+      }
+      break;
+    case PARTITION_SPLIT:
+      // Recurse into the four quadrants; each call rejects out-of-frame
+      // origins itself.
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+                   pc_tree->split[0]);
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+                   subsize, pc_tree->split[1]);
+      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+                   subsize, pc_tree->split[2]);
+      encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+                   subsize, pc_tree->split[3]);
+      break;
+    default:
+      // A bare string literal is always non-zero, so it must be combined
+      // with 0 for the assertion to fire in debug builds.
+      assert(0 && "Invalid partition type.");
+      break;
+  }
+
+  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
 static void rd_use_partition(VP9_COMP *cpi,
                              const TileInfo *const tile,
                              MODE_INFO **mi_8x8,
                              TOKENEXTRA **tp, int mi_row, int mi_col,
                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                             int do_recon) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  const int mis = cm->mode_info_stride;
-  int bsl = b_width_log2(bsize);
-  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  int ms = num_4x4_blocks_wide / 2;
-  int mh = num_4x4_blocks_high / 2;
-  int bss = (1 << bsl) / 4;
+                             int do_recon, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mis = cm->mi_stride;
+  const int bsl = b_width_log2(bsize);
+  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
+  const int bss = (1 << bsl) / 4;
   int i, pl;
   PARTITION_TYPE partition = PARTITION_NONE;
   BLOCK_SIZE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   int last_part_rate = INT_MAX;
-  int64_t last_part_dist = INT_MAX;
-  int split_rate = INT_MAX;
-  int64_t split_dist = INT_MAX;
+  int64_t last_part_dist = INT64_MAX;
+  int64_t last_part_rd = INT64_MAX;
   int none_rate = INT_MAX;
-  int64_t none_dist = INT_MAX;
+  int64_t none_dist = INT64_MAX;
+  int64_t none_rd = INT64_MAX;
   int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT_MAX;
+  int64_t chosen_dist = INT64_MAX;
+  int64_t chosen_rd = INT64_MAX;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  int do_partition_search = 1;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  partition = partition_lookup[bsl][bs_type];
+  assert(num_4x4_blocks_wide_lookup[bsize] ==
+         num_4x4_blocks_high_lookup[bsize]);
 
+  partition = partition_lookup[bsl][bs_type];
   subsize = get_subsize(bsize, partition);
 
-  if (bsize < BLOCK_8X8) {
-    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
-    // there is nothing to be done.
-    if (xd->ab_index != 0) {
-      *rate = 0;
-      *dist = 0;
-      return;
-    }
-  } else {
-    *(get_sb_partitioning(x, bsize)) = subsize;
-  }
+  pc_tree->partitioning = partition;
   save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   if (bsize == BLOCK_16X16) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
+  } else {
+    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
   }
 
-  x->fast_ms = 0;
-  x->subblock_ref = 0;
-
-  if (cpi->sf.adjust_partitioning_from_last_frame) {
+  if (!x->in_active_map) {
+    do_partition_search = 0;
+    if (mi_row + (mi_step >> 1) < cm->mi_rows &&
+        mi_col + (mi_step >> 1) < cm->mi_cols) {
+      pc_tree->partitioning = PARTITION_NONE;
+      bs_type = mi_8x8[0]->mbmi.sb_type = bsize;
+      subsize = bsize;
+      partition = PARTITION_NONE;
+    }
+  }
+  if (do_partition_search &&
+      cpi->sf.partition_search_type == SEARCH_PARTITION &&
+      cpi->sf.adjust_partitioning_from_last_frame) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
       sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
@@ -1051,44 +1529,46 @@ static void rd_use_partition(VP9_COMP *cpi,
     // If partition is not none try none unless each of the 4 splits are split
     // even further..
     if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + (ms >> 1) < cm->mi_rows &&
-        mi_col + (ms >> 1) < cm->mi_cols) {
-      *(get_sb_partitioning(x, bsize)) = bsize;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
-                    get_block_context(x, bsize), INT64_MAX);
+        mi_row + (mi_step >> 1) < cm->mi_rows &&
+        mi_col + (mi_step >> 1) < cm->mi_cols) {
+      pc_tree->partitioning = PARTITION_NONE;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
+                       ctx, INT64_MAX, 0);
 
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
-      none_rate += x->partition_cost[pl][PARTITION_NONE];
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+      if (none_rate < INT_MAX) {
+        none_rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist);
+      }
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
       mi_8x8[0]->mbmi.sb_type = bs_type;
-      *(get_sb_partitioning(x, bsize)) = subsize;
+      pc_tree->partitioning = partition;
     }
   }
 
   switch (partition) {
     case PARTITION_NONE:
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    bsize, get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, bsize, ctx, INT64_MAX, 0);
       break;
     case PARTITION_HORZ:
-      *get_sb_index(xd, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize, &pc_tree->horizontal[0],
+                       INT64_MAX, 0);
       if (last_part_rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
+          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
         int rt = 0;
         int64_t dt = 0;
-        update_state(cpi, get_block_context(x, subsize), subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *get_sb_index(xd, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+        update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &rt, &dt,
+                         subsize, &pc_tree->horizontal[1], INT64_MAX, 1);
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
 
@@ -1097,21 +1577,22 @@ static void rd_use_partition(VP9_COMP *cpi,
       }
       break;
     case PARTITION_VERT:
-      *get_sb_index(xd, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize, &pc_tree->vertical[0],
+                       INT64_MAX, 0);
       if (last_part_rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
         int rt = 0;
         int64_t dt = 0;
-        update_state(cpi, get_block_context(x, subsize), subsize, 0);
-        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *get_sb_index(xd, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+        update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &rt, &dt,
+                         subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
+                         INT64_MAX, 1);
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
         last_part_rate += rt;
@@ -1119,12 +1600,17 @@ static void rd_use_partition(VP9_COMP *cpi,
       }
       break;
     case PARTITION_SPLIT:
-      // Split partition.
+      if (bsize == BLOCK_8X8) {
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                         &last_part_dist, subsize, pc_tree->leaf_split[0],
+                         INT64_MAX, 0);
+        break;
+      }
       last_part_rate = 0;
       last_part_dist = 0;
       for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * (ms >> 1);
-        int y_idx = (i >> 1) * (ms >> 1);
+        int x_idx = (i & 1) * (mi_step >> 1);
+        int y_idx = (i >> 1) * (mi_step >> 1);
         int jj = i >> 1, ii = i & 0x01;
         int rt;
         int64_t dt;
@@ -1132,14 +1618,12 @@ static void rd_use_partition(VP9_COMP *cpi,
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
-        *get_sb_index(xd, subsize) = i;
-
         rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
-                         i != 3);
-        if (rt == INT_MAX || dt == INT_MAX) {
+                         i != 3, pc_tree->split[i]);
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
         last_part_rate += rt;
@@ -1150,86 +1634,83 @@ static void rd_use_partition(VP9_COMP *cpi,
       assert(0);
   }
 
-  pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                               mi_row, mi_col, bsize);
-  if (last_part_rate < INT_MAX)
+  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  if (last_part_rate < INT_MAX) {
     last_part_rate += x->partition_cost[pl][partition];
+    last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
+  }
 
-  if (cpi->sf.adjust_partitioning_from_last_frame
+  if (do_partition_search
+      && cpi->sf.adjust_partitioning_from_last_frame
+      && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
-      && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
-      && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
+      && (mi_row + mi_step < cm->mi_rows ||
+          mi_row + (mi_step >> 1) == cm->mi_rows)
+      && (mi_col + mi_step < cm->mi_cols ||
+          mi_col + (mi_step >> 1) == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
-    split_rate = 0;
-    split_dist = 0;
+    chosen_rate = 0;
+    chosen_dist = 0;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+    pc_tree->partitioning = PARTITION_SPLIT;
 
     // Split partition.
     for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2);
-      int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2);
+      int x_idx = (i & 1) * (mi_step >> 1);
+      int y_idx = (i >> 1) * (mi_step >> 1);
       int rt = 0;
       int64_t dt = 0;
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
       PARTITION_CONTEXT sl[8], sa[8];
 
-      if ((mi_row + y_idx >= cm->mi_rows)
-          || (mi_col + x_idx >= cm->mi_cols))
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      *get_sb_index(xd, split_subsize) = i;
-      *get_sb_partitioning(x, bsize) = split_subsize;
-      *get_sb_partitioning(x, split_subsize) = split_subsize;
-
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
-
-      pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
-                    split_subsize, get_block_context(x, split_subsize),
-                    INT64_MAX);
+      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+                       split_subsize, &pc_tree->split[i]->none,
+                       INT64_MAX, i);
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      if (rt == INT_MAX || dt == INT_MAX) {
-        split_rate = INT_MAX;
-        split_dist = INT_MAX;
+      if (rt == INT_MAX || dt == INT64_MAX) {
+        chosen_rate = INT_MAX;
+        chosen_dist = INT64_MAX;
         break;
       }
 
+      chosen_rate += rt;
+      chosen_dist += dt;
+
       if (i != 3)
         encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
-                  split_subsize);
+                  split_subsize, pc_tree->split[i]);
 
-      split_rate += rt;
-      split_dist += dt;
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row + y_idx, mi_col + x_idx, bsize);
-      split_rate += x->partition_cost[pl][PARTITION_NONE];
+      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
+                                   split_subsize);
+      chosen_rate += x->partition_cost[pl][PARTITION_NONE];
     }
-    pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                                 mi_row, mi_col, bsize);
-    if (split_rate < INT_MAX) {
-      split_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-      chosen_rate = split_rate;
-      chosen_dist = split_dist;
+    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    if (chosen_rate < INT_MAX) {
+      chosen_rate += x->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
     }
   }
 
-  // If last_part is better set the partitioning to that...
-  if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist)
-      < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) {
+  // If last_part is better set the partitioning to that.
+  if (last_part_rd < chosen_rd) {
     mi_8x8[0]->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
-      *(get_sb_partitioning(x, bsize)) = subsize;
+      pc_tree->partitioning = partition;
     chosen_rate = last_part_rate;
     chosen_dist = last_part_dist;
+    chosen_rd = last_part_rd;
   }
-  // If none was better set the partitioning to that...
-  if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)
-      > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) {
+  // If none was better set the partitioning to that.
+  if (none_rd < chosen_rd) {
     if (bsize >= BLOCK_8X8)
-      *(get_sb_partitioning(x, bsize)) = bsize;
+      pc_tree->partitioning = PARTITION_NONE;
     chosen_rate = none_rate;
     chosen_dist = none_dist;
   }
@@ -1239,25 +1720,44 @@ static void rd_use_partition(VP9_COMP *cpi,
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
   if ( bsize == BLOCK_64X64)
-    assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
+    assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX);
+
+  if (do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
 
-  if (do_recon)
-    encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+    // Check the projected output rate for this SB against its target
+    // and if necessary apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
+                                    output_enabled, chosen_rate);
+    }
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              chosen_rate, chosen_dist);
+    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize,
+              pc_tree);
+  }
 
   *rate = chosen_rate;
   *dist = chosen_dist;
 }
 
+// Indexed by the smallest block size observed in the reference superblocks;
+// yields the minimum partition size to use as the lower search bound.
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
-  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
-  BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
-  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
+  BLOCK_8X8,   BLOCK_8X8,   BLOCK_8X8,
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16
 };
 
+// Indexed by the largest block size observed in the reference superblocks;
+// yields the maximum partition size to use as the upper search bound.
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
-  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
-  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,
+  BLOCK_16X16, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_32X32, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
+  BLOCK_64X64
 };
 
 // Look at all the mode_info entries for blocks that are part of this
@@ -1284,165 +1784,142 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8,
       *min_block_size = MIN(*min_block_size, sb_type);
       *max_block_size = MAX(*max_block_size, sb_type);
     }
-    index += xd->mode_info_stride;
+    index += xd->mi_stride;
   }
 }
 
+// Next square block size less than or equal to the current block size.
+// Indexed by BLOCK_SIZE; used to guarantee that a square partition stays
+// available when only square partitions are allowed.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
+  BLOCK_8X8, BLOCK_8X8, BLOCK_8X8,
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32,
+  BLOCK_64X64
+};
+
 // Look at neighboring blocks and set a min and max partition size based on
 // what they chose.
+// The range is returned through *min_block_size and *max_block_size. The
+// search starts from BLOCK_4X4..BLOCK_64X64 and is narrowed using the
+// previous frame (non-key frames only) and the left / above superblocks.
 static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
-                                    int row, int col,
+                                    int mi_row, int mi_col,
                                     BLOCK_SIZE *min_block_size,
                                     BLOCK_SIZE *max_block_size) {
-  VP9_COMMON * const cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MODE_INFO ** mi_8x8 = xd->mi_8x8;
-  MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8;
-
-  const int left_in_image = xd->left_available && mi_8x8[-1];
-  const int above_in_image = xd->up_available &&
-                             mi_8x8[-xd->mode_info_stride];
-  MODE_INFO ** above_sb64_mi_8x8;
-  MODE_INFO ** left_sb64_mi_8x8;
-
-  int row8x8_remaining = tile->mi_row_end - row;
-  int col8x8_remaining = tile->mi_col_end - col;
+  MODE_INFO **mi = xd->mi;
+  const int left_in_image = xd->left_available && mi[-1];
+  const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+  const int row8x8_remaining = tile->mi_row_end - mi_row;
+  const int col8x8_remaining = tile->mi_col_end - mi_col;
   int bh, bw;
-
+  BLOCK_SIZE min_size = BLOCK_4X4;
+  BLOCK_SIZE max_size = BLOCK_64X64;
   // Trap case where we do not have a prediction.
-  if (!left_in_image && !above_in_image &&
-      ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) {
-    *min_block_size = BLOCK_4X4;
-    *max_block_size = BLOCK_64X64;
-  } else {
+  // (Without any prediction source the full-range defaults above are kept.)
+  if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
     // Default "min to max" and "max to min"
-    *min_block_size = BLOCK_64X64;
-    *max_block_size = BLOCK_4X4;
+    min_size = BLOCK_64X64;
+    max_size = BLOCK_4X4;
 
     // NOTE: each call to get_sb_partition_size_range() uses the previous
     // passed in values for min and max as a starting point.
-    //
     // Find the min and max partition used in previous frame at this location
-    if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) {
-      get_sb_partition_size_range(cpi, prev_mi_8x8,
-                                  min_block_size, max_block_size);
+    if (cm->frame_type != KEY_FRAME) {
+      MODE_INFO **const prev_mi =
+          &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+      get_sb_partition_size_range(cpi, prev_mi, &min_size, &max_size);
     }
-
     // Find the min and max partition sizes used in the left SB64
     if (left_in_image) {
-      left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
-      get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
-                                  min_block_size, max_block_size);
+      MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
+      get_sb_partition_size_range(cpi, left_sb64_mi, &min_size, &max_size);
     }
-
     // Find the min and max partition sizes used in the above SB64.
     if (above_in_image) {
-      above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE];
-      get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
-                                  min_block_size, max_block_size);
+      MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
+      get_sb_partition_size_range(cpi, above_sb64_mi, &min_size, &max_size);
+    }
+    // adjust observed min and max
+    if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+      min_size = min_partition_size[min_size];
+      max_size = max_partition_size[max_size];
     }
   }
 
-  // Give a bit of leaway either side of the observed min and max
-  *min_block_size = min_partition_size[*min_block_size];
-  *max_block_size = max_partition_size[*max_block_size];
+  // Check border cases where max and min from neighbors may not be legal.
+  max_size = find_partition_size(max_size,
+                                 row8x8_remaining, col8x8_remaining,
+                                 &bh, &bw);
+  // Keep min <= max after the border adjustment.
+  min_size = MIN(min_size, max_size);
 
-  // Check border cases where max and min from neighbours may not be legal.
-  *max_block_size = find_partition_size(*max_block_size,
-                                        row8x8_remaining, col8x8_remaining,
-                                        &bh, &bw);
-  *min_block_size = MIN(*min_block_size, *max_block_size);
+  // When use_square_partition_only is true, make sure at least one square
+  // partition is allowed by selecting the next smaller square size as
+  // *min_block_size.
+  if (cpi->sf.use_square_partition_only &&
+      next_square_size[max_size] < min_size) {
+     min_size = next_square_size[max_size];
+  }
+  *min_block_size = min_size;
+  *max_block_size = max_size;
 }
 
-static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
+// Chooses a partition size range for the superblock at (mi_row, mi_col) and
+// returns it through *min_block_size and *max_block_size. When neighbors are
+// consulted, the range comes from the block sizes in the column to the left
+// and in the referenced row above, with min raised to at least BLOCK_8X8 and
+// max capped at BLOCK_32X32; otherwise it is BLOCK_8X8..BLOCK_32X32.
+static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+                                 int mi_row, int mi_col,
+                                 BLOCK_SIZE *min_block_size,
+                                 BLOCK_SIZE *max_block_size) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  // Only use 8x8 result for non HD videos.
-  // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
-  int use_8x8 = 1;
-
-  if (cm->frame_type && !cpi->is_src_frame_alt_ref &&
-      ((use_8x8 && bsize == BLOCK_16X16) ||
-      bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) {
-    int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0;
-    PICK_MODE_CONTEXT *block_context = NULL;
-
-    if (bsize == BLOCK_16X16) {
-      block_context = x->sb8x8_context[xd->sb_index][xd->mb_index];
-    } else if (bsize == BLOCK_32X32) {
-      block_context = x->mb_context[xd->sb_index];
-    } else if (bsize == BLOCK_64X64) {
-      block_context = x->sb32_context;
-    }
-
-    if (block_context) {
-      ref0 = block_context[0].mic.mbmi.ref_frame[0];
-      ref1 = block_context[1].mic.mbmi.ref_frame[0];
-      ref2 = block_context[2].mic.mbmi.ref_frame[0];
-      ref3 = block_context[3].mic.mbmi.ref_frame[0];
-    }
-
-    // Currently, only consider 4 inter reference frames.
-    if (ref0 && ref1 && ref2 && ref3) {
-      int d01, d23, d02, d13;
-
-      // Motion vectors for the four subblocks.
-      int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row;
-      int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col;
-      int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row;
-      int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col;
-      int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row;
-      int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col;
-      int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row;
-      int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col;
-
-      // Adjust sign if ref is alt_ref.
-      if (cm->ref_frame_sign_bias[ref0]) {
-        mvr0 *= -1;
-        mvc0 *= -1;
-      }
-
-      if (cm->ref_frame_sign_bias[ref1]) {
-        mvr1 *= -1;
-        mvc1 *= -1;
-      }
-
-      if (cm->ref_frame_sign_bias[ref2]) {
-        mvr2 *= -1;
-        mvc2 *= -1;
-      }
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MODE_INFO **mi_8x8 = xd->mi;
+  const int left_in_image = xd->left_available && mi_8x8[-1];
+  const int above_in_image = xd->up_available &&
+                             mi_8x8[-xd->mi_stride];
+  int row8x8_remaining = tile->mi_row_end - mi_row;
+  int col8x8_remaining = tile->mi_col_end - mi_col;
+  int bh, bw;
+  // Start "min at max" and "max at min" so the MIN/MAX folds below work.
+  BLOCK_SIZE min_size = BLOCK_32X32;
+  BLOCK_SIZE max_size = BLOCK_8X8;
+  int bsl = mi_width_log2_lookup[BLOCK_64X64];
+  // Low bit of ((mi_row + mi_col) >> bsl) + chessboard_index; the neighbors
+  // are only consulted when it is 1.
+  int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
+                           cpi->sf.chessboard_index) & 0x01;
+  // Trap case where we do not have a prediction.
+  if (search_range_ctrl &&
+      (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
+    int block;
+    MODE_INFO **mi;
+    BLOCK_SIZE sb_type;
 
-      if (cm->ref_frame_sign_bias[ref3]) {
-        mvr3 *= -1;
-        mvc3 *= -1;
+    // Find the min and max partition sizes used in the left SB64.
+    if (left_in_image) {
+      MODE_INFO *cur_mi;
+      // Walk down the mode-info column immediately to the left.
+      mi = &mi_8x8[-1];
+      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
+        cur_mi = mi[block * xd->mi_stride];
+        // A missing entry is treated as block size 0.
+        sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
+        min_size = MIN(min_size, sb_type);
+        max_size = MAX(max_size, sb_type);
       }
-
-      // Calculate mv distances.
-      d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1));
-      d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3));
-      d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2));
-      d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3));
-
-      if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH &&
-          d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) {
-        // Set fast motion search level.
-        x->fast_ms = 1;
-
-        if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 &&
-            d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) {
-          // Set fast motion search level.
-          x->fast_ms = 2;
-
-          if (!d01 && !d23 && !d02 && !d13) {
-            x->fast_ms = 3;
-            x->subblock_ref = ref0;
-          }
-        }
+    }
+    // Find the min and max partition sizes used in the above SB64.
+    if (above_in_image) {
+      // Walk along the mode-info row MI_BLOCK_SIZE rows above.
+      mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
+      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
+        sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
+        min_size = MIN(min_size, sb_type);
+        max_size = MAX(max_size, sb_type);
       }
     }
+
+    // Map the observed min through the table, fit max to the rows/columns
+    // left in the tile, then clamp the pair.
+    min_size = min_partition_size[min_size];
+    max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
+                                   &bh, &bw);
+    min_size = MIN(min_size, max_size);
+    min_size = MAX(min_size, BLOCK_8X8);
+    max_size = MIN(max_size, BLOCK_32X32);
+  } else {
+    min_size = BLOCK_8X8;
+    max_size = BLOCK_32X32;
   }
+
+  *min_block_size = min_size;
+  *max_block_size = max_size;
 }
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
@@ -1459,14 +1936,16 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
 static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                               TOKENEXTRA **tp, int mi_row,
                               int mi_col, BLOCK_SIZE bsize, int *rate,
-                              int64_t *dist, int do_recon, int64_t best_rd) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
+                              int64_t *dist, int do_recon, int64_t best_rd,
+                              PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i, pl;
   BLOCK_SIZE subsize;
   int this_rate, sum_rate = 0, best_rate = INT_MAX;
@@ -1475,32 +1954,27 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   // Override skipping rectangular partition operations for edge blocks
-  const int force_horz_split = (mi_row + ms >= cm->mi_rows);
-  const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+  const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
 
   int partition_none_allowed = !force_horz_split && !force_vert_split;
-  int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
-  int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8;
-
-  int partition_split_done = 0;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
   (void) *tp_orig;
 
-  if (bsize < BLOCK_8X8) {
-    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
-    // there is nothing to be done.
-    if (xd->ab_index != 0) {
-      *rate = 0;
-      *dist = 0;
-      return;
-    }
-  }
-  assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+  assert(num_8x8_blocks_wide_lookup[bsize] ==
+             num_8x8_blocks_high_lookup[bsize]);
 
   if (bsize == BLOCK_16X16) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
+  } else {
+    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
   }
-
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (cpi->sf.auto_min_max_partition_size) {
@@ -1524,7 +1998,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   if (cpi->sf.disable_split_var_thresh && partition_none_allowed) {
     unsigned int source_variancey;
     vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
-    source_variancey = get_sby_perpixel_variance(cpi, x, bsize);
+    source_variancey = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
     if (source_variancey < cpi->sf.disable_split_var_thresh) {
       do_split = 0;
       if (source_variancey < cpi->sf.disable_split_var_thresh / 2)
@@ -1532,45 +2006,51 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
     }
   }
 
+  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
+    do_split = 0;
   // PARTITION_NONE
   if (partition_none_allowed) {
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
-                  get_block_context(x, bsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
+                     ctx, best_rd, 0);
     if (this_rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(cpi->above_seg_context,
-                                     cpi->left_seg_context,
-                                     mi_row, mi_col, bsize);
+        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rate += x->partition_cost[pl][PARTITION_NONE];
       }
       sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
       if (sum_rd < best_rd) {
-        int64_t stop_thresh = 2048;
+        int64_t stop_thresh = 4096;
+        int64_t stop_thresh_rd;
 
         best_rate = this_rate;
         best_dist = this_dist;
         best_rd = sum_rd;
         if (bsize >= BLOCK_8X8)
-          *(get_sb_partitioning(x, bsize)) = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
 
         // Adjust threshold according to partition size.
         stop_thresh >>= 8 - (b_width_log2_lookup[bsize] +
             b_height_log2_lookup[bsize]);
 
+        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
         // If obtained distortion is very small, choose current partition
         // and stop splitting.
-        if (this_dist < stop_thresh) {
+        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
           do_split = 0;
           do_rect = 0;
         }
       }
     }
+    if (!x->in_active_map) {
+      do_split = 0;
+      do_rect = 0;
+    }
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
   // store estimated motion vector
   if (cpi->sf.adaptive_motion_search)
-    store_pred_mv(x, get_block_context(x, bsize));
+    store_pred_mv(x, ctx);
 
   // PARTITION_SPLIT
   sum_rd = 0;
@@ -1578,38 +2058,58 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   // the starting point of motion search in the following partition type check.
   if (do_split) {
     subsize = get_subsize(bsize, PARTITION_SPLIT);
-    for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
-      const int x_idx = (i & 1) * ms;
-      const int y_idx = (i >> 1) * ms;
-
-      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
-        continue;
-
-      *get_sb_index(xd, subsize) = i;
-      if (cpi->sf.adaptive_motion_search)
-        load_pred_mv(x, get_block_context(x, bsize));
-      rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
-                        &this_rate, &this_dist, i != 3, best_rd - sum_rd);
-
-      if (this_rate == INT_MAX) {
+    if (bsize == BLOCK_8X8) {
+      i = 4;
+      if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+        pc_tree->leaf_split[0]->pred_interp_filter =
+            ctx->mic.mbmi.interp_filter;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                       pc_tree->leaf_split[0], best_rd, 0);
+      if (sum_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
-        sum_rate += this_rate;
-        sum_dist += this_dist;
         sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        if (sum_rd < best_rd) {
+          update_state(cpi, pc_tree->leaf_split[0], mi_row, mi_col, subsize, 0);
+          encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize,
+                            pc_tree->leaf_split[0]);
+          update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        }
+      }
+    } else {
+      for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+      const int x_idx = (i & 1) * mi_step;
+      const int y_idx = (i >> 1) * mi_step;
+
+        if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+          continue;
+
+        if (cpi->sf.adaptive_motion_search)
+          load_pred_mv(x, ctx);
+
+        rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
+                          subsize, &this_rate, &this_dist, i != 3,
+                          best_rd - sum_rd, pc_tree->split[i]);
+
+        if (this_rate == INT_MAX) {
+          sum_rd = INT64_MAX;
+        } else {
+          sum_rate += this_rate;
+          sum_dist += this_dist;
+          sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+        }
       }
     }
+
     if (sum_rd < best_rd && i == 4) {
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
         best_rate = sum_rate;
         best_dist = sum_dist;
         best_rd = sum_rd;
-        *(get_sb_partitioning(x, bsize)) = subsize;
+        pc_tree->partitioning = PARTITION_SPLIT;
       }
     } else {
       // skip rectangular partition test when larger block size
@@ -1617,38 +2117,36 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       if (cpi->sf.less_rectangular_check)
         do_rect &= !partition_none_allowed;
     }
-    partition_split_done = 1;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
-  x->fast_ms = 0;
-  x->subblock_ref = 0;
-
-  if (partition_split_done &&
-      cpi->sf.using_small_partition_info) {
-    compute_fast_motion_search_level(cpi, bsize);
-  }
-
   // PARTITION_HORZ
   if (partition_horz_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_HORZ);
-    *get_sb_index(xd, subsize) = 0;
     if (cpi->sf.adaptive_motion_search)
-      load_pred_mv(x, get_block_context(x, bsize));
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+      load_pred_mv(x, ctx);
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->horizontal[0].pred_interp_filter =
+          ctx->mic.mbmi.interp_filter;
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     &pc_tree->horizontal[0], best_rd, 0);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
-    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
-      update_state(cpi, get_block_context(x, subsize), subsize, 0);
-      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+    if (sum_rd < best_rd && mi_row + mi_step < cm->mi_rows) {
+      PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+      update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
 
-      *get_sb_index(xd, subsize) = 1;
       if (cpi->sf.adaptive_motion_search)
-        load_pred_mv(x, get_block_context(x, bsize));
-      pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+        load_pred_mv(x, ctx);
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)
+        pc_tree->horizontal[1].pred_interp_filter =
+            ctx->mic.mbmi.interp_filter;
+      rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rate,
+                       &this_dist, subsize, &pc_tree->horizontal[1],
+                       best_rd - sum_rd, 1);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1658,41 +2156,46 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       }
     }
     if (sum_rd < best_rd) {
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_HORZ];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
         best_rd = sum_rd;
         best_rate = sum_rate;
         best_dist = sum_dist;
-        *(get_sb_partitioning(x, bsize)) = subsize;
+        pc_tree->partitioning = PARTITION_HORZ;
       }
     }
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
-
   // PARTITION_VERT
   if (partition_vert_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_VERT);
 
-    *get_sb_index(xd, subsize) = 0;
     if (cpi->sf.adaptive_motion_search)
-      load_pred_mv(x, get_block_context(x, bsize));
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+      load_pred_mv(x, ctx);
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->vertical[0].pred_interp_filter =
+          ctx->mic.mbmi.interp_filter;
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     &pc_tree->vertical[0], best_rd, 0);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
-      update_state(cpi, get_block_context(x, subsize), subsize, 0);
-      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+    if (sum_rd < best_rd && mi_col + mi_step < cm->mi_cols) {
+      update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize,
+                        &pc_tree->vertical[0]);
 
-      *get_sb_index(xd, subsize) = 1;
       if (cpi->sf.adaptive_motion_search)
-        load_pred_mv(x, get_block_context(x, bsize));
-      pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+        load_pred_mv(x, ctx);
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)
+        pc_tree->vertical[1].pred_interp_filter =
+            ctx->mic.mbmi.interp_filter;
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rate,
+                       &this_dist, subsize,
+                       &pc_tree->vertical[1], best_rd - sum_rd,
+                       1);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1702,79 +2205,61 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
       }
     }
     if (sum_rd < best_rd) {
-      pl = partition_plane_context(cpi->above_seg_context,
-                                   cpi->left_seg_context,
-                                   mi_row, mi_col, bsize);
+      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rate += x->partition_cost[pl][PARTITION_VERT];
       sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
       if (sum_rd < best_rd) {
         best_rate = sum_rate;
         best_dist = sum_dist;
         best_rd = sum_rd;
-        *(get_sb_partitioning(x, bsize)) = subsize;
+        pc_tree->partitioning = PARTITION_VERT;
       }
     }
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
-
-
+  // TODO(jbb): This code was added to avoid a static analysis
+  // warning about best_rd not being used after this point.
+  // This code should be refactored so that the duplicate
+  // checks occur in some sub function and thus are used...
+  (void) best_rd;
   *rate = best_rate;
   *dist = best_dist;
 
-  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
-    encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and, if necessary, apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
+      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
+                                    best_rate);
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              best_rate, best_dist);
+
+    encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+  }
+
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
     assert(best_rate < INT_MAX);
-    assert(best_dist < INT_MAX);
+    assert(best_dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
 
-// Examines 64x64 block and chooses a best reference frame
-static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
-                                    int mi_row, int mi_col) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl;
-  int ms = bs / 2;
-  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-  PARTITION_CONTEXT sl[8], sa[8];
-  int pl;
-  int r;
-  int64_t d;
-
-  save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
-
-  // Default is non mask (all reference frames allowed.
-  cpi->ref_frame_mask = 0;
-
-  // Do RD search for 64x64.
-  if ((mi_row + (ms >> 1) < cm->mi_rows) &&
-      (mi_col + (ms >> 1) < cm->mi_cols)) {
-    cpi->set_ref_frame_mask = 1;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
-                  get_block_context(x, BLOCK_64X64), INT64_MAX);
-    pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
-                                 mi_row, mi_col, BLOCK_64X64);
-    r += x->partition_cost[pl][PARTITION_NONE];
-
-    *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
-    cpi->set_ref_frame_mask = 0;
-  }
-
-  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
-}
-
-static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, TOKENEXTRA **tp, int *totalrate) {
-  VP9_COMMON * const cm = &cpi->common;
+static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, TOKENEXTRA **tp) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
 
   // Initialize the left context for the new SB row
-  vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
+  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
+  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
   for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
@@ -1782,58 +2267,89 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
     int dummy_rate;
     int64_t dummy_dist;
 
-    vp9_zero(cpi->mb.pred_mv);
+    int i;
+    MACROBLOCK *x = &cpi->mb;
 
-    if (cpi->sf.reference_masking)
-      rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
+    if (sf->adaptive_pred_interp_filter) {
+      for (i = 0; i < 64; ++i)
+        x->leaf_tree[i].pred_interp_filter = SWITCHABLE;
 
-    if (cpi->sf.use_lastframe_partitioning ||
-        cpi->sf.use_one_partition_size_always ) {
-      const int idx_str = cm->mode_info_stride * mi_row + mi_col;
-      MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
-      MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
+      for (i = 0; i < 64; ++i) {
+        x->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+        x->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+        x->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+        x->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+      }
+    }
+
+    vp9_zero(cpi->mb.pred_mv);
 
+    if ((sf->partition_search_type == SEARCH_PARTITION &&
+         sf->use_lastframe_partitioning) ||
+         sf->partition_search_type == FIXED_PARTITION ||
+         sf->partition_search_type == VAR_BASED_PARTITION ||
+         sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
+      const int idx_str = cm->mi_stride * mi_row + mi_col;
+      MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+      MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str;
       cpi->mb.source_variance = UINT_MAX;
-      if (cpi->sf.use_one_partition_size_always) {
+      if (sf->partition_search_type == FIXED_PARTITION) {
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
-        rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                         &dummy_rate, &dummy_dist, 1);
+        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
+                               sf->always_this_block_size);
+        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                         &dummy_rate, &dummy_dist, 1, x->pc_root);
+      } else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
+        BLOCK_SIZE bsize;
+        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+        bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
+        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
+        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                         &dummy_rate, &dummy_dist, 1, x->pc_root);
+      } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+        choose_partitioning(cpi, tile, mi_row, mi_col);
+        rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                         &dummy_rate, &dummy_dist, 1, x->pc_root);
       } else {
-        if ((cpi->common.current_video_frame
-            % cpi->sf.last_partitioning_redo_frequency) == 0
+        if ((cm->current_video_frame
+            % sf->last_partitioning_redo_frequency) == 0
             || cm->prev_mi == 0
-            || cpi->common.show_frame == 0
-            || cpi->common.frame_type == KEY_FRAME
-            || cpi->is_src_frame_alt_ref
-            || ((cpi->sf.use_lastframe_partitioning ==
+            || cm->show_frame == 0
+            || cm->frame_type == KEY_FRAME
+            || cpi->rc.is_src_frame_alt_ref
+            || ((sf->use_lastframe_partitioning ==
                  LAST_FRAME_PARTITION_LOW_MOTION) &&
-                 sb_has_motion(cpi, prev_mi_8x8))) {
+                 sb_has_motion(cm, prev_mi))) {
           // If required set upper and lower partition size limits
-          if (cpi->sf.auto_min_max_partition_size) {
+          if (sf->auto_min_max_partition_size) {
             set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
             rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                    &cpi->sf.min_partition_size,
-                                    &cpi->sf.max_partition_size);
+                                    &sf->min_partition_size,
+                                    &sf->max_partition_size);
           }
           rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                            &dummy_rate, &dummy_dist, 1, INT64_MAX);
+                            &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root);
         } else {
-          copy_partitioning(cpi, mi_8x8, prev_mi_8x8);
-          rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
-                           &dummy_rate, &dummy_dist, 1);
+          if (sf->constrain_copy_partition &&
+              sb_has_motion(cm, prev_mi))
+            constrain_copy_partitioning(cpi, tile, mi, prev_mi,
+                                        mi_row, mi_col, BLOCK_16X16);
+          else
+            copy_partitioning(cm, mi, prev_mi);
+          rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                           &dummy_rate, &dummy_dist, 1, x->pc_root);
         }
       }
     } else {
       // If required set upper and lower partition size limits
-      if (cpi->sf.auto_min_max_partition_size) {
+      if (sf->auto_min_max_partition_size) {
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
         rd_auto_partition_range(cpi, tile, mi_row, mi_col,
-                                &cpi->sf.min_partition_size,
-                                &cpi->sf.max_partition_size);
+                                &sf->min_partition_size,
+                                &sf->max_partition_size);
       }
       rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rate, &dummy_dist, 1, INT64_MAX);
+                        &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root);
     }
   }
 }
@@ -1844,46 +2360,18 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
 
-  x->act_zbin_adj = 0;
-  cpi->seg0_idx = 0;
-
-  xd->mode_info_stride = cm->mode_info_stride;
-
-  // reset intra mode contexts
-  if (frame_is_intra_only(cm))
-    vp9_init_mbmode_probs(cm);
-
   // Copy data over into macro block data structures.
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
 
-  // TODO(jkoleszar): are these initializations required?
-  setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]],
-                   0, 0, NULL);
-  setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0);
-
-  setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
-  xd->mi_8x8[0]->mbmi.mode = DC_PRED;
-  xd->mi_8x8[0]->mbmi.uv_mode = DC_PRED;
-
-  vp9_zero(cpi->y_mode_count);
-  vp9_zero(cpi->y_uv_mode_count);
-  vp9_zero(cm->counts.inter_mode);
-  vp9_zero(cpi->partition_count);
-  vp9_zero(cpi->intra_inter_count);
-  vp9_zero(cpi->comp_inter_count);
-  vp9_zero(cpi->single_ref_count);
-  vp9_zero(cpi->comp_ref_count);
-  vp9_zero(cm->counts.tx);
-  vp9_zero(cm->counts.mbskip);
+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   // Note: this memset assumes above_context[0], [1] and [2]
   // are allocated as part of the same buffer.
-  vpx_memset(cpi->above_context[0], 0,
-             sizeof(*cpi->above_context[0]) *
+  vpx_memset(xd->above_context[0], 0,
+             sizeof(*xd->above_context[0]) *
              2 * aligned_mi_cols * MAX_MB_PLANE);
-  vpx_memset(cpi->above_seg_context, 0,
-             sizeof(*cpi->above_seg_context) * aligned_mi_cols);
+  vpx_memset(xd->above_seg_context, 0,
+             sizeof(*xd->above_seg_context) * aligned_mi_cols);
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
@@ -1902,292 +2390,751 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   }
 }
 
-static void switch_tx_mode(VP9_COMP *cpi) {
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
-      cpi->common.tx_mode >= ALLOW_32X32)
-    cpi->common.tx_mode = ALLOW_32X32;
-}
+static int check_dual_ref_flags(VP9_COMP *cpi) {
+  const int ref_flags = cpi->ref_frame_flags;  // tested against VP9_*_FLAG below
 
-static void encode_frame_internal(VP9_COMP *cpi) {
-  int mi_row;
-  MACROBLOCK * const x = &cpi->mb;
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  int totalrate;
+  if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
+    return 0;  // SEG_LVL_REF_FRAME is active on segment 1: always report 0
+  } else {
+    return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
+        + !!(ref_flags & VP9_ALT_FLAG)) >= 2;  // 1 if two or more flags are set
+  }
+}
 
-//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
-//           cpi->common.current_video_frame, cpi->common.show_frame,
-//           cm->frame_type);
+static void reset_skip_txfm_size(VP9_COMMON *cm, TX_SIZE txfm_max) {
+  int mi_row, mi_col;  // walk every entry of the visible mode-info grid
+  const int mis = cm->mi_stride;  // pointer step from one grid row to the next
+  MODE_INFO **mi_ptr = cm->mi_grid_visible;  // first entry of the current row
 
-// debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
+  for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
+    for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+      if (mi_ptr[mi_col]->mbmi.tx_size > txfm_max)  // clamp tx_size to txfm_max
+        mi_ptr[mi_col]->mbmi.tx_size = txfm_max;
+    }
   }
-#endif
+}
 
-  totalrate = 0;
+// Classifies the frame being coded; select_tx_mode() uses the returned
+// MV_REFERENCE_FRAME value as a row index into tx_select_threshes.
+static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
+  if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+  if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
+    return ALTREF_FRAME;
+  if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+    return LAST_FRAME;
+  return GOLDEN_FRAME;
+}
 
-  // Reset frame count of inter 0,0 motion vector usage.
-  cpi->inter_zz_count = 0;
+static TX_MODE select_tx_mode(const VP9_COMP *cpi) {  // result becomes cm->tx_mode
+  if (cpi->oxcf.lossless) {
+    return ONLY_4X4;  // lossless configuration: 4x4 transforms only
+  } else if (cpi->common.current_video_frame == 0) {
+    return TX_MODE_SELECT;  // frame 0 always gets TX_MODE_SELECT
+  } else {
+    if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+      return ALLOW_32X32;
+    } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
+      const RD_OPT *const rd_opt = &cpi->rd;
+      const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);  // threshold row
+      return rd_opt->tx_select_threshes[frame_type][ALLOW_32X32] >
+                 rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+                     ALLOW_32X32 : TX_MODE_SELECT;  // mode with the larger threshold
+    } else {  // NOTE(review): encode_frame_internal() zeroes tx_select_threshes and tx_stepdown_count before this call - confirm
+      unsigned int total = 0;  // sum of tx_stepdown_count[] over all TX_SIZES
+      int i;
+      for (i = 0; i < TX_SIZES; ++i)
+        total += cpi->tx_stepdown_count[i];
 
-  vp9_zero(cm->counts.switchable_interp);
-  vp9_zero(cpi->tx_stepdown_count);
+      if (total) {  // bin 0 holding more than 90% of the total selects ALLOW_32X32
+        const double fraction = (double)cpi->tx_stepdown_count[0] / total;
+        return fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
+      } else {
+        return cpi->common.tx_mode;  // nothing counted: keep the current mode
+      }
+    }
+  }
+}
 
-  xd->mi_8x8 = cm->mi_grid_visible;
-  // required for vp9_frame_init_quantizer
-  xd->mi_8x8[0] = cm->mi;
+// Start RTC Exploration: helpers for the non-RD coding path follow.
+typedef enum {  // NOTE(review): no use of this enum is visible in this part of the patch
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9  // values 7 and 8 are not assigned
+} motion_vector_context;
+
+// Fills *mbmi as an intra block of size bsize: luma and chroma both use
+// `mode`, both motion vectors are zero, skip and segment_id are cleared.
+static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                          PREDICTION_MODE mode) {
+  mbmi->sb_type = bsize;
+  mbmi->tx_size = max_txsize_lookup[bsize];
+  mbmi->mode = mbmi->uv_mode = mode;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+  mbmi->skip = 0;
+  mbmi->segment_id = 0;
+}
 
-  xd->last_mi = cm->prev_mi;
+static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+                                int mi_row, int mi_col,
+                                int *rate, int64_t *dist,  // handed to vp9_pick_inter_mode()
+                                BLOCK_SIZE bsize) {  // picks the mode for one block without RD search
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  set_offsets(cpi, tile, mi_row, mi_col, bsize);
+  xd->mi[0]->mbmi.sb_type = bsize;
 
-  vp9_zero(cpi->NMVcount);
-  vp9_zero(cpi->coef_counts);
-  vp9_zero(cm->counts.eob_branch);
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+    if (xd->mi[0]->mbmi.segment_id && x->in_static_area)  // non-zero segment, static area
+      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+  }
 
-  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
-      && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
-  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+  if (!frame_is_intra_only(cm)) {
+    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col,
+                        rate, dist, bsize);
+  } else {  // intra-only frame: DC_PRED is forced; *rate and *dist are not written
+    set_mode_info(&xd->mi[0]->mbmi, bsize, DC_PRED);
+  }
+  duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+}
 
-  vp9_frame_init_quantizer(cpi);
+// Writes the modes stored in pc_tree for the block at (mi_row, mi_col) back
+// through xd->mi and replicates them; recurses into PARTITION_SPLIT nodes.
+static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize,
+                              BLOCK_SIZE subsize, PC_TREE *pc_tree) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int hbs = (1 << b_width_log2(bsize)) / 4;  // offset of the second half
 
-  vp9_initialize_rd_consts(cpi);
-  vp9_initialize_me_consts(cpi, cm->base_qindex);
-  switch_tx_mode(cpi);
+  assert(bsize >= BLOCK_8X8);
 
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Initialize encode frame context.
-    init_encode_frame_mb_context(cpi);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-    // Build a frame level activity map
-    build_activity_map(cpi);
+  switch (pc_tree->partitioning) {
+    case PARTITION_NONE:
+      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      *(xd->mi[0]) = pc_tree->none.mic;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+      break;
+    case PARTITION_VERT:
+      subsize = get_subsize(bsize, PARTITION_VERT);  // replicate one half only
+      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      *(xd->mi[0]) = pc_tree->vertical[0].mic;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
+      if (mi_col + hbs < cm->mi_cols) {
+        set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs);
+        *(xd->mi[0]) = pc_tree->vertical[1].mic;
+        duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize);
+      }
+      break;
+    case PARTITION_HORZ:
+      subsize = get_subsize(bsize, PARTITION_HORZ);  // replicate one half only
+      set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+      *(xd->mi[0]) = pc_tree->horizontal[0].mic;
+      duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
+      if (mi_row + hbs < cm->mi_rows) {
+        set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col);
+        *(xd->mi[0]) = pc_tree->horizontal[1].mic;
+        duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize);
+      }
+      break;
+    case PARTITION_SPLIT: {
+      BLOCK_SIZE subsubsize = get_subsize(subsize, PARTITION_SPLIT);
+      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize,
+                        subsubsize, pc_tree->split[0]);
+      fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
+                        subsubsize, pc_tree->split[1]);
+      fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
+                        subsubsize, pc_tree->split[2]);
+      fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
+                        subsubsize, pc_tree->split[3]);
+      break;
+    }
+    default: break;
   }
+}
 
-  // Re-initialize encode frame context.
-  init_encode_frame_mb_context(cpi);
+static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+                                 TOKENEXTRA **tp, int mi_row,
+                                 int mi_col, BLOCK_SIZE bsize, int *rate,
+                                 int64_t *dist, int do_recon, int64_t best_rd,
+                                 PC_TREE *pc_tree) {  // tries NONE, SPLIT, HORZ, VERT; winner goes to pc_tree->partitioning
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;  // half the block width
+  TOKENEXTRA *tp_orig = *tp;
+  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+  int i;
+  BLOCK_SIZE subsize = bsize;
+  int this_rate, sum_rate = 0, best_rate = INT_MAX;
+  int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
+  int64_t sum_rd = 0;
+  int do_split = bsize >= BLOCK_8X8;
+  int do_rect = 1;
+  // Override skipping rectangular partition operations for edge blocks
+  const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+  const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
 
-  vp9_zero(cpi->rd_comp_pred_diff);
-  vp9_zero(cpi->rd_filter_diff);
-  vp9_zero(cpi->rd_tx_select_diff);
-  vp9_zero(cpi->rd_tx_select_threshes);
+  int partition_none_allowed = !force_horz_split && !force_vert_split;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
+  (void) *tp_orig;  // tp_orig is otherwise only read by the asserts at the end
 
-  set_prev_mi(cm);
+  assert(num_8x8_blocks_wide_lookup[bsize] ==
+             num_8x8_blocks_high_lookup[bsize]);
 
-  {
-    struct vpx_usec_timer emr_timer;
-    vpx_usec_timer_start(&emr_timer);
+  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
 
-    {
-      // Take tiles into account and give start/end MB
-      int tile_col, tile_row;
-      TOKENEXTRA *tp = cpi->tok;
-      const int tile_cols = 1 << cm->log2_tile_cols;
-      const int tile_rows = 1 << cm->log2_tile_rows;
+  // Determine partition types in search according to the speed features.
+  // The threshold set here has to be of square block size.
+  if (cpi->sf.auto_min_max_partition_size) {
+    partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
+                               bsize >= cpi->sf.min_partition_size);
+    partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
+                                bsize >  cpi->sf.min_partition_size) ||
+                                force_horz_split);
+    partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
+                                bsize >  cpi->sf.min_partition_size) ||
+                                force_vert_split);
+    do_split &= bsize > cpi->sf.min_partition_size;
+  }
+  if (cpi->sf.use_square_partition_only) {
+    partition_horz_allowed &= force_horz_split;
+    partition_vert_allowed &= force_vert_split;
+  }
 
-      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-        for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-          TileInfo tile;
-          TOKENEXTRA *tp_old = tp;
+  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
+    do_split = 0;
 
-          // For each row of SBs in the frame
-          vp9_tile_init(&tile, cm, tile_row, tile_col);
-          for (mi_row = tile.mi_row_start;
-               mi_row < tile.mi_row_end; mi_row += 8)
-            encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
+  // PARTITION_NONE. NOTE(review): this_rate unset on intra-only frames - confirm.
+  if (partition_none_allowed) {
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
+                        &this_rate, &this_dist, bsize);
+    ctx->mic.mbmi = xd->mi[0]->mbmi;
 
-          cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
-          assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+    if (this_rate != INT_MAX) {
+      int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      this_rate += x->partition_cost[pl][PARTITION_NONE];
+      sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
+      if (sum_rd < best_rd) {
+        int64_t stop_thresh = 4096;
+        int64_t stop_thresh_rd;
+
+        best_rate = this_rate;
+        best_dist = this_dist;
+        best_rd = sum_rd;
+        if (bsize >= BLOCK_8X8)
+          pc_tree->partitioning = PARTITION_NONE;
+
+        // Adjust threshold according to partition size.
+        stop_thresh >>= 8 - (b_width_log2_lookup[bsize] +
+            b_height_log2_lookup[bsize]);
+
+        stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
+        // If obtained distortion is very small, choose current partition
+        // and stop splitting.
+        if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+          do_split = 0;
+          do_rect = 0;
         }
       }
     }
-
-    vpx_usec_timer_mark(&emr_timer);
-    cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+    if (!x->in_active_map) {
+      do_split = 0;
+      do_rect = 0;
+    }
   }
 
-  if (cpi->sf.skip_encode_sb) {
-    int j;
-    unsigned int intra_count = 0, inter_count = 0;
-    for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
-      intra_count += cpi->intra_inter_count[j][0];
-      inter_count += cpi->intra_inter_count[j][1];
+  // store estimated motion vector
+  store_pred_mv(x, ctx);
+
+  // PARTITION_SPLIT
+  sum_rd = 0;
+  if (do_split) {
+    int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+    sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    for (i = 0; i < 4 && sum_rd < best_rd; ++i) {  // stop once the sum is no better
+      const int x_idx = (i & 1) * ms;
+      const int y_idx = (i >> 1) * ms;
+
+      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+        continue;
+      load_pred_mv(x, ctx);
+      nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
+                           subsize, &this_rate, &this_dist, 0,
+                           best_rd - sum_rd, pc_tree->split[i]);
+
+      if (this_rate == INT_MAX) {
+        sum_rd = INT64_MAX;
+      } else {
+        sum_rate += this_rate;
+        sum_dist += this_dist;
+        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      }
+    }
+
+    if (sum_rd < best_rd) {
+      best_rate = sum_rate;
+      best_dist = sum_dist;
+      best_rd = sum_rd;
+      pc_tree->partitioning = PARTITION_SPLIT;
+    } else {
+      // skip rectangular partition test when larger block size
+      // gives better rd cost
+      if (cpi->sf.less_rectangular_check)
+        do_rect &= !partition_none_allowed;
     }
-    cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count);
-    cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME);
-    cpi->sf.skip_encode_frame &= cm->show_frame;
-  } else {
-    cpi->sf.skip_encode_frame = 0;
   }
 
-  // 256 rate units to the bit,
-  // projected_frame_size in units of BYTES
-  cpi->projected_frame_size = totalrate >> 8;
+  // PARTITION_HORZ. NOTE(review): sum_rate/sum_dist are not reset after SPLIT.
+  if (partition_horz_allowed && do_rect) {
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+    // NOTE(review): first-half this_rate/this_dist never reach sum_rd - confirm.
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
+                        &this_rate, &this_dist, subsize);
 
-#if 0
-  // Keep record of the total distortion this time around for future use
-  cpi->last_frame_distortion = cpi->frame_distortion;
-#endif
-}
+    pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
 
-static int check_dual_ref_flags(VP9_COMP *cpi) {
-  const int ref_flags = cpi->ref_frame_flags;
+    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
-  if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
-    return 0;
-  } else {
-    return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
-        + !!(ref_flags & VP9_ALT_FLAG)) >= 2;
-  }
-}
+    if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+      load_pred_mv(x, ctx);
+      nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
+                          &this_rate, &this_dist, subsize);
 
-static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) {
-  int x, y;
+      pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
 
-  for (y = 0; y < ymbs; y++) {
-    for (x = 0; x < xmbs; x++) {
-      if (!mi_8x8[y * mis + x]->mbmi.skip_coeff)
-        return 0;
+      if (this_rate == INT_MAX) {
+        sum_rd = INT64_MAX;
+      } else {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rate += x->partition_cost[pl][PARTITION_HORZ];
+        sum_rate += this_rate;
+        sum_dist += this_dist;
+        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      }
+    }
+    if (sum_rd < best_rd) {
+      best_rd = sum_rd;
+      best_rate = sum_rate;
+      best_dist = sum_dist;
+      pc_tree->partitioning = PARTITION_HORZ;
     }
   }
 
-  return 1;
-}
+  // PARTITION_VERT. NOTE(review): same sum_rate/this_rate handling as HORZ.
+  if (partition_vert_allowed && do_rect) {
+    subsize = get_subsize(bsize, PARTITION_VERT);
 
-static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs,
-                          TX_SIZE tx_size) {
-  int x, y;
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
 
-  for (y = 0; y < ymbs; y++) {
-    for (x = 0; x < xmbs; x++)
-      mi_8x8[y * mis + x]->mbmi.tx_size = tx_size;
+    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
+                        &this_rate, &this_dist, subsize);
+    pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+      load_pred_mv(x, ctx);
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
+                          &this_rate, &this_dist, subsize);
+      pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+      if (this_rate == INT_MAX) {
+        sum_rd = INT64_MAX;
+      } else {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        this_rate += x->partition_cost[pl][PARTITION_VERT];
+        sum_rate += this_rate;
+        sum_dist += this_dist;
+        sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      }
+    }
+    if (sum_rd < best_rd) {
+      best_rate = sum_rate;
+      best_dist = sum_dist;
+      best_rd = sum_rd;
+      pc_tree->partitioning = PARTITION_VERT;
+    }
   }
-}
+  // TODO(JBB): The following line is here just to avoid a static warning
+  // that occurs because at this point we never again reuse best_rd
+  // despite setting it here.  The code should be refactored to avoid this.
+  (void) best_rd;
 
-static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8,
-                                   int mis, TX_SIZE max_tx_size, int bw, int bh,
-                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
+  *rate = best_rate;  // still INT_MAX when no partition was accepted
+  *dist = best_dist;
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+  if (best_rate == INT_MAX)
     return;
-  } else {
-    MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi;
-    if (mbmi->tx_size > max_tx_size) {
-      const int ymbs = MIN(bh, cm->mi_rows - mi_row);
-      const int xmbs = MIN(bw, cm->mi_cols - mi_col);
 
-      assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
-             get_skip_flag(mi_8x8, mis, ymbs, xmbs));
-      set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size);
+  // update mode info array
+  subsize = get_subsize(bsize, pc_tree->partitioning);
+  fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize,
+                    pc_tree);
+
+  if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+    int output_enabled = (bsize == BLOCK_64X64);
+
+    // Check the projected output rate for this SB against its target
+    // and, if necessary, apply a Q delta using segmentation to get
+    // closer to the target.
+    if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+      vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
+                                    best_rate);
     }
+
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              best_rate, best_dist);
+
+    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+  }
+
+  if (bsize == BLOCK_64X64) {
+    assert(tp_orig < *tp);
+    assert(best_rate < INT_MAX);
+    assert(best_dist < INT64_MAX);
+  } else {
+    assert(tp_orig == *tp);
   }
 }
 
-static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
-                                    TX_SIZE max_tx_size, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
-  int bw, bh;
-  const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+// Runs non-RD mode selection on the block at (mi_row, mi_col) using the
+// partitioning already stored in mi[] (read from mi[0]->mbmi.sb_type), saves
+// each picked mode in pc_tree and sums rate/distortion into *totrate/*totdist.
+static void nonrd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
+                                MODE_INFO **mi, TOKENEXTRA **tp, int mi_row,
+                                int mi_col, BLOCK_SIZE bsize,
+                                int output_enabled, int *totrate,
+                                int64_t *totdist, PC_TREE *pc_tree) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4;
+  const int mis = cm->mi_stride;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  int rate = INT_MAX;
+  int64_t dist = INT64_MAX;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
-  bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
-
-  if (bw == bs && bh == bs) {
-    reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, bs, mi_row,
-                           mi_col, bsize);
-  } else if (bw == bs && bh < bs) {
-    reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, hbs, mi_row,
-                           mi_col, bsize);
-    reset_skip_txfm_size_b(cpi, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs,
-                           mi_row + hbs, mi_col, bsize);
-  } else if (bw < bs && bh == bs) {
-    reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col, bsize);
-    reset_skip_txfm_size_b(cpi, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col + hbs, bsize);
+  subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
+  partition = partition_lookup[bsl][subsize];
 
-  } else {
-    const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
-    int n;
+  // Pick a mode for every prediction block of the stored partition and keep
+  // a copy of each result in the matching pc_tree context.
+  switch (partition) {
+    case PARTITION_NONE:
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
+      pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+      break;
+    case PARTITION_VERT:
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
+      pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+      // The right half is coded only when it starts inside the frame.
+      if (mi_col + hbs < cm->mi_cols) {
+        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
+                            &rate, &dist, subsize);
+        pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+        if (rate != INT_MAX && dist != INT64_MAX &&
+            *totrate != INT_MAX && *totdist != INT64_MAX) {
+          *totrate += rate;
+          *totdist += dist;
+        }
+      }
+      break;
+    case PARTITION_HORZ:
+      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
+      pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+      // The lower half is coded only when it starts inside the frame.
+      if (mi_row + hbs < cm->mi_rows) {
+        nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
+                            &rate, &dist, subsize);
+        // Read through xd->mi[0] like every other case; mi[0] is the upper half.
+        pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+        if (rate != INT_MAX && dist != INT64_MAX &&
+            *totrate != INT_MAX && *totdist != INT64_MAX) {
+          *totrate += rate;
+          *totdist += dist;
+        }
+      }
+      break;
+    case PARTITION_SPLIT: {
+      int i;
+      subsize = get_subsize(bsize, PARTITION_SPLIT);
+      // Quadrant 0 shares this block's origin, so it accumulates in place.
+      nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
+                          subsize, output_enabled, totrate, totdist,
+                          pc_tree->split[0]);
+      for (i = 1; i < 4; ++i) {
+        const int row_off = (i >> 1) * hbs, col_off = (i & 1) * hbs;
+        // Reset first: a quadrant outside the frame returns without writing,
+        // and must not be counted with the previous quadrant's values.
+        rate = INT_MAX;
+        dist = INT64_MAX;
+        nonrd_use_partition(cpi, tile, mi + row_off * mis + col_off, tp,
+                            mi_row + row_off, mi_col + col_off, subsize,
+                            output_enabled, &rate, &dist, pc_tree->split[i]);
+        if (rate != INT_MAX && dist != INT64_MAX &&
+            *totrate != INT_MAX && *totdist != INT64_MAX) {
+          *totrate += rate;
+          *totdist += dist;
+        }
+      }
+      break;
+    }
+    default:
+      assert(0 && "Invalid partition type.");
+      break;
+  }
 
-    assert(bw < bs && bh < bs);
+  if (bsize == BLOCK_64X64 && output_enabled) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
+                                              *totrate, *totdist);
+    encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
+  }
+}
+
+static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                                int mi_row, TOKENEXTRA **tp) {  // codes one row of 64x64 blocks of a tile
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int mi_col;
 
-    for (n = 0; n < 4; n++) {
-      const int mi_dc = hbs * (n & 1);
-      const int mi_dr = hbs * (n >> 1);
+  // Initialize the left context for the new SB row
+  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
+  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
-      reset_skip_txfm_size_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size,
-                              mi_row + mi_dr, mi_col + mi_dc, subsize);
+  // Code each SB in the row
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    MACROBLOCK *x = &cpi->mb;  // NOTE(review): shadows the identical outer x
+    int dummy_rate = 0;  // rate/dist results are not read after the calls below
+    int64_t dummy_dist = 0;
+    const int idx_str = cm->mi_stride * mi_row + mi_col;  // grid offset of this SB
+    MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+    MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str;
+    BLOCK_SIZE bsize;
+
+    x->in_static_area = 0;
+    x->source_variance = UINT_MAX;
+    vp9_zero(x->pred_mv);
+
+    // Set the partition type of the 64X64 block
+    switch (cpi->sf.partition_search_type) {
+      case VAR_BASED_PARTITION:
+        choose_partitioning(cpi, tile, mi_row, mi_col);
+        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                            1, &dummy_rate, &dummy_dist, x->pc_root);
+        break;
+      case SOURCE_VAR_BASED_PARTITION:
+        set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
+        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                            1, &dummy_rate, &dummy_dist, x->pc_root);
+        break;
+      case VAR_BASED_FIXED_PARTITION:
+      case FIXED_PARTITION:  // both cases share the fixed-size path below
+        bsize = cpi->sf.partition_search_type == FIXED_PARTITION ?
+                cpi->sf.always_this_block_size :
+                get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
+        set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
+        nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+                            1, &dummy_rate, &dummy_dist, x->pc_root);
+        break;
+      case REFERENCE_PARTITION:
+        if (cpi->sf.partition_check ||
+            !is_background(cpi, tile, mi_row, mi_col)) {
+          set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+          auto_partition_range(cpi, tile, mi_row, mi_col,
+                               &cpi->sf.min_partition_size,
+                               &cpi->sf.max_partition_size);
+          nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
+                               &dummy_rate, &dummy_dist, 1, INT64_MAX,
+                               x->pc_root);
+        } else {
+          copy_partitioning(cm, mi, prev_mi);  // prev_mi comes from prev_mi_grid_visible
+          nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
+                              BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
+                              x->pc_root);
+        }
+        break;
+      default:
+        assert(0);  // any other partition_search_type is not handled here
     }
   }
 }
+// End RTC Exploration
 
-static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
-  VP9_COMMON * const cm = &cpi->common;
-  int mi_row, mi_col;
-  const int mis = cm->mode_info_stride;
-//  MODE_INFO *mi, *mi_ptr = cm->mi;
-  MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible;
+static int get_skip_encode_frame(const VP9_COMMON *cm) {
+  unsigned int intra_count = 0, inter_count = 0;  // totals over all contexts
+  int j;
 
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
-    mi_8x8 = mi_ptr;
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) {
-      reset_skip_txfm_size_sb(cpi, mi_8x8, txfm_max, mi_row, mi_col,
-                              BLOCK_64X64);
-    }
+  for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
+    intra_count += cm->counts.intra_inter[j][0];
+    inter_count += cm->counts.intra_inter[j][1];
   }
+  // Non-zero when inter_count exceeds 4x intra_count on a shown non-key frame.
+  return (intra_count << 2) < inter_count &&
+         cm->frame_type != KEY_FRAME &&
+         cm->show_frame;
 }
 
-static int get_frame_type(VP9_COMP *cpi) {
-  int frame_type;
-  if (frame_is_intra_only(&cpi->common))
-    frame_type = 0;
-  else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)
-    frame_type = 3;
-  else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
-    frame_type = 1;
-  else
-    frame_type = 2;
-  return frame_type;
+static void encode_frame_internal(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  RD_OPT *const rd_opt = &cpi->rd;
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  vp9_zero(cm->counts);
+  vp9_zero(cpi->coef_counts);
+  vp9_zero(cpi->tx_stepdown_count);
+  vp9_zero(rd_opt->comp_pred_diff);
+  vp9_zero(rd_opt->filter_diff);
+  vp9_zero(rd_opt->tx_select_diff);
+  vp9_zero(rd_opt->tx_select_threshes);
+
+  cm->tx_mode = select_tx_mode(cpi);
+
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
+                           cm->y_dc_delta_q == 0 &&
+                           cm->uv_dc_delta_q == 0 &&
+                           cm->uv_ac_delta_q == 0;
+  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
+  vp9_frame_init_quantizer(cpi);
+
+  vp9_initialize_rd_consts(cpi);
+  vp9_initialize_me_consts(cpi, cm->base_qindex);
+  init_encode_frame_mb_context(cpi);
+  set_prev_mi(cm);
+
+  if (sf->use_nonrd_pick_mode) {
+    // Initialize internal buffer pointers for rtc coding, where non-RD
+    // mode decision is used and hence no buffer pointer swap needed.
+    int i;
+    struct macroblock_plane *const p = x->plane;
+    struct macroblockd_plane *const pd = xd->plane;
+    PICK_MODE_CONTEXT *ctx = &x->pc_root->none;
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      p[i].coeff = ctx->coeff_pbuf[i][0];
+      p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+      pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+      p[i].eobs = ctx->eobs_pbuf[i][0];
+    }
+    vp9_zero(x->zcoeff_blk);
+
+    if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION &&
+        cm->current_video_frame > 0) {
+      int check_freq = sf->search_type_check_frequency;
+
+      if ((cm->current_video_frame - 1) % check_freq == 0) {
+        cpi->use_large_partition_rate = 0;
+      }
+
+      if ((cm->current_video_frame - 1) % check_freq == 1) {
+        const int mbs_in_b32x32 = 1 << ((b_width_log2_lookup[BLOCK_32X32] -
+                                  b_width_log2_lookup[BLOCK_16X16]) +
+                                  (b_height_log2_lookup[BLOCK_32X32] -
+                                  b_height_log2_lookup[BLOCK_16X16]));
+        cpi->use_large_partition_rate = cpi->use_large_partition_rate * 100 *
+                                        mbs_in_b32x32 / cm->MBs;
+      }
+
+      if ((cm->current_video_frame - 1) % check_freq >= 1) {
+        if (cpi->use_large_partition_rate < 15)
+          sf->partition_search_type = FIXED_PARTITION;
+      }
+    }
+  }
+
+  {
+    struct vpx_usec_timer emr_timer;
+    vpx_usec_timer_start(&emr_timer);
+
+    {
+      // Take tiles into account and give start/end MB
+      int tile_col, tile_row;
+      TOKENEXTRA *tp = cpi->tok;
+      const int tile_cols = 1 << cm->log2_tile_cols;
+      const int tile_rows = 1 << cm->log2_tile_rows;
+
+      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+        for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+          TileInfo tile;
+          TOKENEXTRA *tp_old = tp;
+          int mi_row;
+
+          // For each row of SBs in the tile
+          vp9_tile_init(&tile, cm, tile_row, tile_col);
+          for (mi_row = tile.mi_row_start;
+               mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
+            if (sf->use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
+              encode_nonrd_sb_row(cpi, &tile, mi_row, &tp);
+            else
+              encode_rd_sb_row(cpi, &tile, mi_row, &tp);
+          }
+          cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
+          assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+        }
+      }
+    }
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+  }
+
+  sf->skip_encode_frame = sf->skip_encode_sb ? get_skip_encode_frame(cm) : 0;
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
 }
 
-static void select_tx_mode(VP9_COMP *cpi) {
-  if (cpi->oxcf.lossless) {
-    cpi->common.tx_mode = ONLY_4X4;
-  } else if (cpi->common.current_video_frame == 0) {
-    cpi->common.tx_mode = TX_MODE_SELECT;
+static INTERP_FILTER get_interp_filter(
+    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
+  if (!is_alt_ref &&
+      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] &&
+      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] &&
+      threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) {
+    return EIGHTTAP_SMOOTH;
+  } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] &&
+             threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) {
+    return EIGHTTAP_SHARP;
+  } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) {
+    return EIGHTTAP;
   } else {
-    if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
-      cpi->common.tx_mode = ALLOW_32X32;
-    } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
-      int frame_type = get_frame_type(cpi);
-      cpi->common.tx_mode =
-          cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
-          > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-          ALLOW_32X32 : TX_MODE_SELECT;
-    } else {
-      unsigned int total = 0;
-      int i;
-      for (i = 0; i < TX_SIZES; ++i)
-        total += cpi->tx_stepdown_count[i];
-      if (total) {
-        double fraction = (double)cpi->tx_stepdown_count[0] / total;
-        cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
-        // printf("fraction = %f\n", fraction);
-      }  // else keep unchanged
-    }
+    return SWITCHABLE;
   }
 }
 
 void vp9_encode_frame(VP9_COMP *cpi) {
-  VP9_COMMON * const cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
 
   // In the longer term the encoder should be generalized to match the
   // decoder such that we allow compound where one of the 3 buffers has a
@@ -2196,10 +3143,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
   // side behavior is where the ALT ref buffer has opposite sign bias to
   // the other two.
   if (!frame_is_intra_only(cm)) {
-    if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
-         == cm->ref_frame_sign_bias[GOLDEN_FRAME])
-        || (cm->ref_frame_sign_bias[ALTREF_FRAME]
-            == cm->ref_frame_sign_bias[LAST_FRAME])) {
+    if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+             cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+        (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+             cm->ref_frame_sign_bias[LAST_FRAME])) {
       cm->allow_comp_inter_inter = 0;
     } else {
       cm->allow_comp_inter_inter = 1;
@@ -2209,112 +3156,73 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     }
   }
 
-  if (cpi->sf.RD) {
-    int i, pred_type;
-    INTERPOLATION_TYPE filter_type;
-    /*
-     * This code does a single RD pass over the whole frame assuming
-     * either compound, single or hybrid prediction as per whatever has
-     * worked best for that type of frame in the past.
-     * It also predicts whether another coding mode would have worked
-     * better that this coding mode. If that is the case, it remembers
-     * that for subsequent frames.
-     * It does the same analysis for transform size selection also.
-     */
-    int frame_type = get_frame_type(cpi);
+  if (cpi->sf.frame_parameter_update) {
+    int i;
+
+    // This code does a single RD pass over the whole frame assuming
+    // either compound, single or hybrid prediction as per whatever has
+    // worked best for that type of frame in the past.
+    // It also predicts whether another coding mode would have worked
+    // better than this coding mode. If that is the case, it remembers
+    // that for subsequent frames.
+    // It does the same analysis for transform size selection also.
+    const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+    int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
+    int *const tx_thrs = rd_opt->tx_select_threshes[frame_type];
+    const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == 3 || !cm->allow_comp_inter_inter)
-      pred_type = SINGLE_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][1]
-             > cpi->rd_prediction_type_threshes[frame_type][0]
-             && cpi->rd_prediction_type_threshes[frame_type][1]
-             > cpi->rd_prediction_type_threshes[frame_type][2]
-             && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
-      pred_type = COMP_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][0]
-             > cpi->rd_prediction_type_threshes[frame_type][2])
-      pred_type = SINGLE_PREDICTION_ONLY;
+    if (is_alt_ref || !cm->allow_comp_inter_inter)
+      cm->reference_mode = SINGLE_REFERENCE;
+    else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+             mode_thrs[COMPOUND_REFERENCE] >
+                 mode_thrs[REFERENCE_MODE_SELECT] &&
+             check_dual_ref_flags(cpi) &&
+             cpi->static_mb_pct == 100)
+      cm->reference_mode = COMPOUND_REFERENCE;
+    else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
+      cm->reference_mode = SINGLE_REFERENCE;
     else
-      pred_type = HYBRID_PREDICTION;
-
-    /* filter type selection */
-    // FIXME(rbultje) for some odd reason, we often select smooth_filter
-    // as default filter for ARF overlay frames. This is a REALLY BAD
-    // IDEA so we explicitly disable it here.
-    if (frame_type != 3 &&
-        cpi->rd_filter_threshes[frame_type][1] >
-            cpi->rd_filter_threshes[frame_type][0] &&
-        cpi->rd_filter_threshes[frame_type][1] >
-            cpi->rd_filter_threshes[frame_type][2] &&
-        cpi->rd_filter_threshes[frame_type][1] >
-            cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) {
-      filter_type = EIGHTTAP_SMOOTH;
-    } else if (cpi->rd_filter_threshes[frame_type][2] >
-            cpi->rd_filter_threshes[frame_type][0] &&
-        cpi->rd_filter_threshes[frame_type][2] >
-            cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) {
-      filter_type = EIGHTTAP_SHARP;
-    } else if (cpi->rd_filter_threshes[frame_type][0] >
-                  cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) {
-      filter_type = EIGHTTAP;
-    } else {
-      filter_type = SWITCHABLE;
-    }
+      cm->reference_mode = REFERENCE_MODE_SELECT;
 
-    cpi->mb.e_mbd.lossless = 0;
-    if (cpi->oxcf.lossless) {
-      cpi->mb.e_mbd.lossless = 1;
-    }
+    if (cm->interp_filter == SWITCHABLE)
+      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
 
-    /* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */
-    select_tx_mode(cpi);
-    cpi->common.comp_pred_mode = pred_type;
-    cpi->common.mcomp_filter_type = filter_type;
     encode_frame_internal(cpi);
 
-    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-      const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
-      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
-      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
-    }
+    for (i = 0; i < REFERENCE_MODES; ++i)
+      mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
-      cpi->rd_filter_threshes[frame_type][i] =
-          (cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
-    }
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2;
 
     for (i = 0; i < TX_MODES; ++i) {
-      int64_t pd = cpi->rd_tx_select_diff[i];
-      int diff;
+      int64_t pd = rd_opt->tx_select_diff[i];
       if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
-                     2048 * (TX_SIZES - 1), 0);
-      diff = (int) (pd / cpi->common.MBs);
-      cpi->rd_tx_select_threshes[frame_type][i] += diff;
-      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
+        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0);
+      tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2;
     }
 
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       int single_count_zero = 0;
       int comp_count_zero = 0;
 
       for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
-        single_count_zero += cpi->comp_inter_count[i][0];
-        comp_count_zero += cpi->comp_inter_count[i][1];
+        single_count_zero += cm->counts.comp_inter[i][0];
+        comp_count_zero += cm->counts.comp_inter[i][1];
       }
 
       if (comp_count_zero == 0) {
-        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
-        vp9_zero(cpi->comp_inter_count);
+        cm->reference_mode = SINGLE_REFERENCE;
+        vp9_zero(cm->counts.comp_inter);
       } else if (single_count_zero == 0) {
-        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
-        vp9_zero(cpi->comp_inter_count);
+        cm->reference_mode = COMPOUND_REFERENCE;
+        vp9_zero(cm->counts.comp_inter);
       }
     }
 
-    if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (cm->tx_mode == TX_MODE_SELECT) {
       int count4x4 = 0;
       int count8x8_lp = 0, count8x8_8x8p = 0;
       int count16x16_16x16p = 0, count16x16_lp = 0;
@@ -2334,189 +3242,155 @@ void vp9_encode_frame(VP9_COMP *cpi) {
         count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
       }
 
-      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0
-          && count32x32 == 0) {
-        cpi->common.tx_mode = ALLOW_8X8;
-        reset_skip_txfm_size(cpi, TX_8X8);
-      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0
-                 && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
-        cpi->common.tx_mode = ONLY_4X4;
-        reset_skip_txfm_size(cpi, TX_4X4);
+      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+          count32x32 == 0) {
+        cm->tx_mode = ALLOW_8X8;
+        reset_skip_txfm_size(cm, TX_8X8);
+      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+        cm->tx_mode = ONLY_4X4;
+        reset_skip_txfm_size(cm, TX_4X4);
       } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
-        cpi->common.tx_mode = ALLOW_32X32;
+        cm->tx_mode = ALLOW_32X32;
       } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
-        cpi->common.tx_mode = ALLOW_16X16;
-        reset_skip_txfm_size(cpi, TX_16X16);
+        cm->tx_mode = ALLOW_16X16;
+        reset_skip_txfm_size(cm, TX_16X16);
       }
     }
   } else {
+    cm->reference_mode = SINGLE_REFERENCE;
+    cm->interp_filter = SWITCHABLE;
     encode_frame_internal(cpi);
   }
 }
 
-static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) {
-  const MB_PREDICTION_MODE y_mode = mi->mbmi.mode;
-  const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
+static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
+  const PREDICTION_MODE y_mode = mi->mbmi.mode;
+  const PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
 
-  ++cpi->y_uv_mode_count[y_mode][uv_mode];
-
   if (bsize < BLOCK_8X8) {
     int idx, idy;
-    const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
-    const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-    for (idy = 0; idy < 2; idy += num_4x4_blocks_high)
-      for (idx = 0; idx < 2; idx += num_4x4_blocks_wide)
-        ++cpi->y_mode_count[0][mi->bmi[idy * 2 + idx].as_mode];
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+    for (idy = 0; idy < 2; idy += num_4x4_h)
+      for (idx = 0; idx < 2; idx += num_4x4_w)
+        ++counts->y_mode[0][mi->bmi[idy * 2 + idx].as_mode];
   } else {
-    ++cpi->y_mode_count[size_group_lookup[bsize]][y_mode];
+    ++counts->y_mode[size_group_lookup[bsize]][y_mode];
   }
-}
-
-// Experimental stub function to create a per MB zbin adjustment based on
-// some previously calculated measure of MB activity.
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
-  x->act_zbin_adj = *(x->mb_activity_ptr);
-#else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
 
-  // Apply the masking to the RD multiplier.
-  a = act + 4 * cpi->activity_avg;
-  b = 4 * act + cpi->activity_avg;
+  ++counts->uv_mode[y_mode][uv_mode];
+}
 
-  if (act > cpi->activity_avg)
-    x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1;
-  else
-    x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b);
-#endif
+static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
+  if (enabled) {
+    if (is_inter_block(mbmi)) {
+      if (mbmi->mode == ZEROMV) {
+        return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST
+                                                : LF_ZEROMV_ZBIN_BOOST;
+      } else {
+        return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST
+                                         : MV_ZBIN_BOOST;
+      }
+    } else {
+      return INTRA_ZBIN_BOOST;
+    }
+  } else {
+    return 0;
+  }
 }
+
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi_8x8;
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              PICK_MODE_CONTEXT *ctx) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO **mi_8x8 = xd->mi;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
   unsigned int segment_id = mbmi->segment_id;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+  x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
+                   cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
+                   cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
+                   cpi->sf.allow_skip_recode;
+
+  x->skip_optimize = ctx->is_coded;
+  ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
   x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
-                    xd->q_index < QIDX_SKIP_THRESH);
+                    x->q_index < QIDX_SKIP_THRESH);
+
   if (x->skip_encode)
     return;
 
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (is_inter_block(mbmi)) {
-        if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame[0] != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->sb_type < BLOCK_8X8) {
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        } else {
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-        }
-      } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
-      }
-    }
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 
-    vp9_update_zbin_extra(cpi, x);
-  }
+  // Experimental code. Special case for gf and arf zeromv modes.
+  // Increase zbin size to suppress noise
+  cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi,
+                                             cpi->zbin_mode_boost_enabled);
+  vp9_update_zbin_extra(cpi, x);
 
   if (!is_inter_block(mbmi)) {
-    vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8));
-    vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8));
+    int plane;
+    mbmi->skip = 1;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+      vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
     if (output_enabled)
-      sum_intra_stats(cpi, mi);
+      sum_intra_stats(&cm->counts, mi);
+    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
-    int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
-    YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
-    YV12_BUFFER_CONFIG *second_ref_fb = NULL;
-    if (has_second_ref(mbmi)) {
-      idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])];
-      second_ref_fb = &cm->yv12_fb[idx];
+    int ref;
+    const int is_compound = has_second_ref(mbmi);
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
+                                                     mbmi->ref_frame[ref]);
+      vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                           &xd->block_refs[ref]->sf);
     }
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col,
-                     &xd->scale_factor[0]);
-    setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col,
-                     &xd->scale_factor[1]);
-
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
-  }
 
-  if (!is_inter_block(mbmi)) {
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-  } else if (!x->skip) {
-    vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-  } else {
-    int mb_skip_context = xd->left_available ? mi_8x8[-1]->mbmi.skip_coeff : 0;
-    mb_skip_context += mi_8x8[-mis] ? mi_8x8[-mis]->mbmi.skip_coeff : 0;
-
-    mbmi->skip_coeff = 1;
-    if (output_enabled)
-      cm->counts.mbskip[mb_skip_context][1]++;
-    reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
+    if (!x->skip) {
+      mbmi->skip = 1;
+      vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
+      vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+    } else {
+      mbmi->skip = 1;
+      if (output_enabled)
+        cm->counts.skip[vp9_get_skip_context(xd)][1]++;
+      reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
+    }
   }
 
   if (output_enabled) {
     if (cm->tx_mode == TX_MODE_SELECT &&
         mbmi->sb_type >= BLOCK_8X8  &&
         !(is_inter_block(mbmi) &&
-            (mbmi->skip_coeff ||
+            (mbmi->skip ||
              vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
-      const uint8_t context = vp9_get_pred_context_tx_size(xd);
-      ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
+      ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
+                      &cm->counts.tx)[mbmi->tx_size];
     } else {
       int x, y;
-      TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
-      assert(sizeof(tx_mode_to_biggest_tx_size) /
-             sizeof(tx_mode_to_biggest_tx_size[0]) == TX_MODES);
+      TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
       if (is_inter_block(&mi->mbmi)) {
-        if (sz == TX_32X32 && bsize < BLOCK_32X32)
-          sz = TX_16X16;
-        if (sz == TX_16X16 && bsize < BLOCK_16X16)
-          sz = TX_8X8;
-        if (sz == TX_8X8 && bsize < BLOCK_8X8)
-          sz = TX_4X4;
-      } else if (bsize >= BLOCK_8X8) {
-        sz = mbmi->tx_size;
+        tx_size = MIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                      max_txsize_lookup[bsize]);
       } else {
-        sz = TX_4X4;
+        tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
       }
 
       for (y = 0; y < mi_height; y++)
         for (x = 0; x < mi_width; x++)
           if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
-            mi_8x8[mis * y + x]->mbmi.tx_size = sz;
+            mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
     }
   }
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
index 3e9f5381c06..131e9320199 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -12,11 +12,28 @@
 #ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
 #define VP9_ENCODER_VP9_ENCODEFRAME_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct macroblock;
 struct yv12_buffer_config;
+struct VP9_COMP;
+
+typedef struct {
+  unsigned int sse;
+  int sum;
+  unsigned int var;
+} diff;
 
 void vp9_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src,
                           int mi_row, int mi_col);
 
+void vp9_encode_frame(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeintra.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeintra.c
deleted file mode 100644
index 32b4593fcb8..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeintra.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
-  x->skip_encode = 0;
-  mbmi->mode = DC_PRED;
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
-                                                                 : TX_8X8)
-                                   : TX_4X4;
-  vp9_encode_intra_block_y(x, mbmi->sb_type);
-  return vp9_get_mb_ss(x->plane[0].src_diff);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeintra.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeintra.h
deleted file mode 100644
index e217924653f..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeintra.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_ENCODEINTRA_H_
-#define VP9_ENCODER_VP9_ENCODEINTRA_H_
-
-#include "vp9/encoder/vp9_onyx_int.h"
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, void *arg);
-
-#endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
index e52e8ec1e2d..3b231b7f250 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -19,75 +19,61 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#include "vp9/encoder/vp9_dct.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
+
+struct encode_b_args {
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+  unsigned char *skip;
+};
+
 void vp9_subtract_block_c(int rows, int cols,
-                          int16_t *diff_ptr, ptrdiff_t diff_stride,
-                          const uint8_t *src_ptr, ptrdiff_t src_stride,
-                          const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++)
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+      diff[c] = src[c] - pred[c];
 
-    diff_ptr += diff_stride;
-    pred_ptr += pred_stride;
-    src_ptr  += src_stride;
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
   }
 }
 
-static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int bw = plane_block_width(bsize, pd);
-  const int bh = plane_block_height(bsize, pd);
+  const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
 
-  vp9_subtract_block(bh, bw, p->src_diff, bw,
-                     p->src.buf, p->src.stride,
+  vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  subtract_plane(x, bsize, 0);
-}
-
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  int i;
-
-  for (i = 1; i < MAX_MB_PLANE; i++)
-    subtract_plane(x, bsize, i);
-}
-
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  vp9_subtract_sby(x, bsize);
-  vp9_subtract_sbuv(x, bsize);
-}
-
 #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
-typedef struct vp9_token_state vp9_token_state;
 
-struct vp9_token_state {
+typedef struct vp9_token_state {
   int           rate;
   int           error;
   int           next;
   signed char   token;
   short         qc;
-};
+} vp9_token_state;
 
 // TODO(jimbankoski): experiment to find optimal RD numbers.
-#define Y1_RD_MULT 4
-#define UV_RD_MULT 2
-
-static const int plane_rd_mult[4] = {
-  Y1_RD_MULT,
-  UV_RD_MULT,
-};
+static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
 
 #define UPDATE_RD_COST()\
 {\
@@ -112,62 +98,56 @@ static int trellis_get_coeff_context(const int16_t *scan,
   return pt;
 }
 
-static void optimize_b(MACROBLOCK *mb,
-                       int plane, int block, BLOCK_SIZE plane_bsize,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size) {
+static int optimize_b(MACROBLOCK *mb, int plane, int block,
+                      TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
+  struct macroblock_plane *const p = &mb->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ref = is_inter_block(&xd->mi[0]->mbmi);
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  int16_t *qcoeff_ptr;
-  int16_t *dqcoeff_ptr;
-  int eob = pd->eobs[block], final_eob, sz = 0;
-  const int i0 = 0;
-  int rc, x, next, i;
-  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1, t0, t1;
-  int best, band, pt;
-  PLANE_TYPE type = pd->plane_type;
-  int err_mult = plane_rd_mult[type];
+  uint8_t token_cache[1024];
+  const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int eob = p->eobs[block];
+  const PLANE_TYPE type = pd->plane_type;
   const int default_eob = 16 << (tx_size << 1);
-  const int16_t *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
-  uint8_t token_cache[1024];
-  const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
   const int16_t *dequant_ptr = pd->dequant;
   const uint8_t *const band_translate = get_band_translate(tx_size);
+  const scan_order *const so = get_scan(xd, tx_size, type, block);
+  const int16_t *const scan = so->scan;
+  const int16_t *const nb = so->neighbors;
+  int next = eob, sz = 0;
+  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+  int64_t rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1, t0, t1;
+  int best, band, pt, i, final_eob;
 
   assert((!type && !plane) || (type && plane));
-  dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
-  qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-  get_scan(xd, tx_size, type, ib, &scan, &nb);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME)
+  if (!ref)
     rdmult = (rdmult * 9) >> 4;
-  rddiv = mb->rddiv;
+
   /* Initialize the sentinel node of the trellis. */
   tokens[eob][0].rate = 0;
   tokens[eob][0].error = 0;
   tokens[eob][0].next = default_eob;
-  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].token = EOB_TOKEN;
   tokens[eob][0].qc = 0;
-  *(tokens[eob] + 1) = *(tokens[eob] + 0);
-  next = eob;
+  tokens[eob][1] = tokens[eob][0];
+
   for (i = 0; i < eob; i++)
-    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
-        qcoeff_ptr[scan[i]]].token];
+    token_cache[scan[i]] =
+        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token];
 
-  for (i = eob; i-- > i0;) {
+  for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
-
-    rc = scan[i];
-    x = qcoeff_ptr[rc];
+    const int rc = scan[i];
+    int x = qcoeff[rc];
     /* Only add a trellis state for non-zero coefficients. */
     if (x) {
       int shortcut = 0;
@@ -179,20 +159,18 @@ static void optimize_b(MACROBLOCK *mb,
       t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(band_translate, i + 1);
+        band = band_translate[i + 1];
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-        rate0 +=
-          mb->token_costs[tx_size][type][ref][band][0][pt]
-                         [tokens[next][0].token];
-        rate1 +=
-          mb->token_costs[tx_size][type][ref][band][0][pt]
-                         [tokens[next][1].token];
+        rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
+                                [tokens[next][0].token];
+        rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
+                                [tokens[next][1].token];
       }
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
+      base_bits = vp9_dct_value_cost_ptr[x];
+      dx = mul * (dqcoeff[rc] - coeff[rc]);
       d2 = dx * dx;
       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
@@ -205,9 +183,9 @@ static void optimize_b(MACROBLOCK *mb,
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
-                                         dequant_ptr[rc != 0]))
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
+          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+                                               dequant_ptr[rc != 0]))
         shortcut = 1;
       else
         shortcut = 0;
@@ -222,21 +200,19 @@ static void optimize_b(MACROBLOCK *mb,
         /* If we reduced this coefficient to zero, check to see if
          *  we need to move the EOB back here.
          */
-        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
       } else {
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(band_translate, i + 1);
-        if (t0 != DCT_EOB_TOKEN) {
+        band = band_translate[i + 1];
+        if (t0 != EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
           rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                   [tokens[next][0].token];
         }
-        if (t1 != DCT_EOB_TOKEN) {
+        if (t1 != EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
           rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                   [tokens[next][1].token];
@@ -246,7 +222,7 @@ static void optimize_b(MACROBLOCK *mb,
       UPDATE_RD_COST();
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
+      base_bits = vp9_dct_value_cost_ptr[x];
 
       if (shortcut) {
         dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
@@ -264,16 +240,16 @@ static void optimize_b(MACROBLOCK *mb,
       /* There's no choice to make for a zero coefficient, so we don't
        *  add a new trellis node, but we do need to update the costs.
        */
-      band = get_coef_band(band_translate, i + 1);
+      band = band_translate[i + 1];
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != DCT_EOB_TOKEN) {
+      if (t0 != EOB_TOKEN) {
         tokens[next][0].rate +=
             mb->token_costs[tx_size][type][ref][band][1][0][t0];
         tokens[next][0].token = ZERO_TOKEN;
       }
-      if (t1 != DCT_EOB_TOKEN) {
+      if (t1 != EOB_TOKEN) {
         tokens[next][1].rate +=
             mb->token_costs[tx_size][type][ref][band][1][0][t1];
         tokens[next][1].token = ZERO_TOKEN;
@@ -284,129 +260,91 @@ static void optimize_b(MACROBLOCK *mb,
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(band_translate, i + 1);
-  pt = combine_entropy_contexts(*a, *l);
+  band = band_translate[i + 1];
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
   error0 = tokens[next][0].error;
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1];
+  rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
+  rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
-  final_eob = i0 - 1;
-  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
-  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
+  final_eob = -1;
+  vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+  vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
-    x = tokens[i][best].qc;
+    const int x = tokens[i][best].qc;
+    const int rc = scan[i];
     if (x) {
       final_eob = i;
     }
-    rc = scan[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
+
+    qcoeff[rc] = x;
+    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
 
     next = tokens[i][best].next;
     best = best_index[i][best];
   }
   final_eob++;
 
-  xd->plane[plane].eobs[block] = final_eob;
-  *a = *l = (final_eob > 0);
+  mb->plane[plane].eobs[block] = final_eob;
+  return final_eob;
 }
 
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
-                    TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
-  int x, y;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  optimize_b(mb, plane, block, plane_bsize,
-             &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
+static INLINE void fdct32x32(int rd_transform,
+                             const int16_t *src, int16_t *dst, int src_stride) {
+  if (rd_transform)
+    vp9_fdct32x32_rd(src, dst, src_stride);
+  else
+    vp9_fdct32x32(src, dst, src_stride);
 }
 
-static void optimize_init_b(int plane, BLOCK_SIZE bsize,
-                            struct encode_b_args *args) {
-  const MACROBLOCKD *xd = &args->x->e_mbd;
-  const struct macroblockd_plane* const pd = &xd->plane[plane];
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
-
-  vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
-                           pd->above_context, pd->left_context,
-                           num_4x4_w, num_4x4_h);
-}
-
-void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
-                     TX_SIZE tx_size, void *arg) {
-  struct encode_b_args* const args = arg;
-  MACROBLOCK* const x = args->x;
-  MACROBLOCKD* const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int16_t *scan, *iscan;
-  uint16_t *eob = &pd->eobs[block];
-  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
-  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
-  int xoff, yoff;
-  int16_t *src_diff;
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
+                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int i, j;
+  const int16_t *src_diff;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   switch (tx_size) {
     case TX_32X32:
-      scan = vp9_default_scan_32x32;
-      iscan = vp9_default_iscan_32x32;
-      block >>= 6;
-      xoff = 32 * (block & twmask);
-      yoff = 32 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      if (x->use_lp32x32fdct)
-        vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
-      else
-        vp9_fdct32x32(src_diff, coeff, bw * 4);
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan, iscan);
+                           pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                           scan_order->iscan);
       break;
     case TX_16X16:
-      scan = vp9_default_scan_16x16;
-      iscan = vp9_default_iscan_16x16;
-      block >>= 4;
-      xoff = 16 * (block & twmask);
-      yoff = 16 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_fdct16x16(src_diff, coeff, bw * 4);
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+                     pd->dequant, p->zbin_extra, eob,
+                     scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
-      scan = vp9_default_scan_8x8;
-      iscan = vp9_default_iscan_8x8;
-      block >>= 2;
-      xoff = 8 * (block & twmask);
-      yoff = 8 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_fdct8x8(src_diff, coeff, bw * 4);
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+                     pd->dequant, p->zbin_extra, eob,
+                     scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
-      scan = vp9_default_scan_4x4;
-      iscan = vp9_default_iscan_4x4;
-      xoff = 4 * (block & twmask);
-      yoff = 4 * (block >> twl);
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      x->fwd_txm4x4(src_diff, coeff, bw * 4);
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+                     pd->dequant, p->zbin_extra, eob,
+                     scan_order->scan, scan_order->iscan);
       break;
     default:
       assert(0);
@@ -419,251 +357,247 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx *const ctx = args->ctx;
+  struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                       block);
-
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                 pd->dst.buf, pd->dst.stride);
+  int i, j;
+  uint8_t *dst;
+  ENTROPY_CONTEXT *a, *l;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+  a = &ctx->ta[plane][i];
+  l = &ctx->tl[plane][j];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
-    int x, y;
-    pd->eobs[block] = 0;
-    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-    ctx->ta[plane][x] = 0;
-    ctx->tl[plane][y] = 0;
+    p->eobs[block] = 0;
+    *a = *l = 0;
     return;
   }
 
-  vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+  if (!x->skip_recode)
+    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+
+  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+    const int ctx = combine_entropy_contexts(*a, *l);
+    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+  } else {
+    *a = *l = p->eobs[block] > 0;
+  }
 
-  if (x->optimize)
-    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+  if (p->eobs[block])
+    *(args->skip) = 0;
 
-  if (x->skip_encode || pd->eobs[block] == 0)
+  if (x->skip_encode || p->eobs[block] == 0)
     return;
 
   switch (tx_size) {
     case TX_32X32:
-      vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+      vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
     case TX_16X16:
-      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
     case TX_8X8:
-      vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+      vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
     case TX_4X4:
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+      xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
     default:
-      assert(!"Invalid transform size");
+      assert(0 && "Invalid transform size");
   }
 }
 
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
-  struct encode_b_args *const args = arg;
-  MACROBLOCK *const x = args->x;
+  MACROBLOCK *const x = (MACROBLOCK *)arg;
   MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                       block);
-
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                 pd->dst.buf, pd->dst.stride);
-
-  vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+  int i, j;
+  uint8_t *dst;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 
-  if (pd->eobs[block] == 0)
-    return;
+  vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
-  xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+  if (p->eobs[block] > 0)
+    xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 }
 
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
-
-  vp9_subtract_sby(x, bsize);
-  if (x->optimize)
-    optimize_init_b(0, bsize, &arg);
-
-  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  vp9_subtract_plane(x, bsize, 0);
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+                                         encode_block_pass1, x);
 }
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
-
-  vp9_subtract_sb(x, bsize);
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (!x->skip_recode)
+      vp9_subtract_plane(x, bsize, plane);
+
+    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+      const struct macroblockd_plane* const pd = &xd->plane[plane];
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
+      vp9_get_entropy_contexts(bsize, tx_size, pd,
+                               ctx.ta[plane], ctx.tl[plane]);
+    }
 
-  if (x->optimize) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; ++i)
-      optimize_init_b(i, bsize, &arg);
+    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                           &arg);
   }
-
-  foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
 
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, void *arg) {
+static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                               TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int16_t *scan, *iscan;
+  const scan_order *scan_order;
   TX_TYPE tx_type;
-  MB_PREDICTION_MODE mode;
-  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
-  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
-  int xoff, yoff;
+  PREDICTION_MODE mode;
+  const int bwl = b_width_log2(plane_bsize);
+  const int diff_stride = 4 * (1 << bwl);
   uint8_t *src, *dst;
   int16_t *src_diff;
-  uint16_t *eob = &pd->eobs[block];
-
-  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
-    extend_for_intra(xd, plane_bsize, plane, block, tx_size);
-
-  // if (x->optimize)
-  // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+  uint16_t *eob = &p->eobs[block];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int i, j;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
+  src = &p->src.buf[4 * (j * src_stride + i)];
+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   switch (tx_size) {
     case TX_32X32:
-      scan = vp9_default_scan_32x32;
-      iscan = vp9_default_iscan_32x32;
+      scan_order = &vp9_default_scan_orders[TX_32X32];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      block >>= 6;
-      xoff = 32 * (block & twmask);
-      yoff = 32 * (block >> twl);
-      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(32, 32, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      if (x->use_lp32x32fdct)
-        vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
-      else
-        vp9_fdct32x32(src_diff, coeff, bw * 4);
-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan, iscan);
+      vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
+      if (!x->skip_recode) {
+        vp9_subtract_block(32, 32, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                             p->quant, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                             scan_order->iscan);
+      }
       if (!x->skip_encode && *eob)
-        vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_16X16:
-      tx_type = get_tx_type_16x16(pd->plane_type, xd);
-      scan = get_scan_16x16(tx_type);
-      iscan = get_iscan_16x16(tx_type);
+      tx_type = get_tx_type(pd->plane_type, xd);
+      scan_order = &vp9_scan_orders[TX_16X16][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      block >>= 4;
-      xoff = 16 * (block & twmask);
-      yoff = 16 * (block >> twl);
-      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(16, 16, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
-      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+      vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
+      if (!x->skip_recode) {
+        vp9_subtract_block(16, 16, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
+        vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                       p->quant, p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
       if (!x->skip_encode && *eob)
-        vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_8X8:
-      tx_type = get_tx_type_8x8(pd->plane_type, xd);
-      scan = get_scan_8x8(tx_type);
-      iscan = get_iscan_8x8(tx_type);
+      tx_type = get_tx_type(pd->plane_type, xd);
+      scan_order = &vp9_scan_orders[TX_8X8][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      block >>= 2;
-      xoff = 8 * (block & twmask);
-      yoff = 8 * (block >> twl);
-      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(8, 8, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
-      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+      vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
+      if (!x->skip_recode) {
+        vp9_subtract_block(8, 8, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
+        vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
       if (!x->skip_encode && *eob)
-        vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
-      scan = get_scan_4x4(tx_type);
-      iscan = get_iscan_4x4(tx_type);
-      if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
-        mode = xd->mi_8x8[0]->bmi[block].as_mode;
-      else
-        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-
-      xoff = 4 * (block & twmask);
-      yoff = 4 * (block >> twl);
-      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
+      scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+      mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(4, 4, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
-      else
-        x->fwd_txm4x4(src_diff, coeff, bw * 4);
-      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? src_stride : dst_stride,
+                              dst, dst_stride, i, j, plane);
+
+      if (!x->skip_recode) {
+        vp9_subtract_block(4, 4, src_diff, diff_stride,
+                           src, src_stride, dst, dst_stride);
+        if (tx_type != DCT_DCT)
+          vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
+        else
+          x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, p->zbin_extra, eob, scan_order->scan,
+                       scan_order->iscan);
+      }
+
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
+          xd->itxm_add(dqcoeff, dst, dst_stride, *eob);
         else
-          vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+          vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
       }
       break;
     default:
       assert(0);
   }
+  if (*eob)
+    *(args->skip) = 0;
 }
 
-void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD* const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
-
-  foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
-                                     &arg);
-}
-void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD* const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  struct encode_b_args arg = {x, &ctx};
-  foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
+void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
+                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                            unsigned char *skip) {
+  struct encode_b_args arg = {x, NULL, skip};
+  encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
 }
 
+
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
+
+  vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
+                                         &arg);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
index 61dd7358e0e..80214598484 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
@@ -13,42 +13,29 @@
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-typedef struct {
-  MB_PREDICTION_MODE mode;
-  MV_REFERENCE_FRAME ref_frame;
-  MV_REFERENCE_FRAME second_ref_frame;
-} MODE_DEFINITION;
-
-typedef struct {
-  MV_REFERENCE_FRAME ref_frame;
-  MV_REFERENCE_FRAME second_ref_frame;
-} REF_DEFINITION;
-
-struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
-};
-
-struct encode_b_args {
-  MACROBLOCK *x;
-  struct optimize_ctx *ctx;
-};
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
+
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
+                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
-void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
-                     TX_SIZE tx_size, void *arg);
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
+                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                            unsigned char *skip);
 
-void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
index 9ebcc498392..9d448651161 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
@@ -13,12 +13,21 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemv.h"
 
+static struct vp9_token mv_joint_encodings[MV_JOINTS];
+static struct vp9_token mv_class_encodings[MV_CLASSES];
+static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
+static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
 
-#ifdef ENTROPY_STATS
-extern unsigned int active_section;
-#endif
+void vp9_entropy_mv_init(void) {
+  vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
+  vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
+  vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
+  vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
+}
 
 static void encode_mv_component(vp9_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
@@ -36,13 +45,13 @@ static void encode_mv_component(vp9_writer* w, int comp,
   vp9_write(w, sign, mvcomp->sign);
 
   // Class
-  write_token(w, vp9_mv_class_tree, mvcomp->classes,
-              &vp9_mv_class_encodings[mv_class]);
+  vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
+                  &mv_class_encodings[mv_class]);
 
   // Integer bits
   if (mv_class == MV_CLASS_0) {
-    write_token(w, vp9_mv_class0_tree, mvcomp->class0,
-                &vp9_mv_class0_encodings[d]);
+    vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0,
+                    &mv_class0_encodings[d]);
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
@@ -51,9 +60,9 @@ static void encode_mv_component(vp9_writer* w, int comp,
   }
 
   // Fractional bits
-  write_token(w, vp9_mv_fp_tree,
-              mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
-              &vp9_mv_fp_encodings[fr]);
+  vp9_write_token(w, vp9_mv_fp_tree,
+                  mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
+                  &mv_fp_encodings[fr]);
 
   // High precision bit
   if (usehp)
@@ -68,7 +77,7 @@ static void build_nmv_component_cost_table(int *mvcost,
   int i, v;
   int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
   int bits_cost[MV_OFFSET_BITS][2];
-  int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
   int class0_hp_cost[2], hp_cost[2];
 
   sign_cost[0] = vp9_cost_zero(mvcomp->sign);
@@ -124,155 +133,68 @@ static void build_nmv_component_cost_table(int *mvcost,
   }
 }
 
-static int update_mv(vp9_writer *w, const unsigned int ct[2],
-                     vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
-  vp9_prob mod_p = new_p | 1;
-  const int cur_b = cost_branch256(ct, *cur_p);
-  const int mod_b = cost_branch256(ct, mod_p);
-  const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-  if (cur_b - mod_b > cost) {
-    *cur_p = mod_p;
-    vp9_write(w, 1, upd_p);
-    vp9_write_literal(w, mod_p >> 1, 7);
-    return 1;
-  } else {
-    vp9_write(w, 0, upd_p);
-    return 0;
+static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
+                     vp9_prob upd_p) {
+  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
+  const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
+                     cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
+  vp9_write(w, update, upd_p);
+  if (update) {
+    *cur_p = new_p;
+    vp9_write_literal(w, new_p >> 1, 7);
   }
+  return update;
 }
 
-static void counts_to_nmv_context(
-    nmv_context_counts *nmv_count,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]) {
-  int i, j, k;
-  vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
-                                   prob->joints,
-                                   branch_ct_joint,
-                                   nmv_count->joints, 0);
-  for (i = 0; i < 2; ++i) {
-    const uint32_t s0 = nmv_count->comps[i].sign[0];
-    const uint32_t s1 = nmv_count->comps[i].sign[1];
-
-    prob->comps[i].sign = get_binary_prob(s0, s1);
-    branch_ct_sign[i][0] = s0;
-    branch_ct_sign[i][1] = s1;
-    vp9_tree_probs_from_distribution(vp9_mv_class_tree,
-                                     prob->comps[i].classes,
-                                     branch_ct_classes[i],
-                                     nmv_count->comps[i].classes, 0);
-    vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
-                                     prob->comps[i].class0,
-                                     branch_ct_class0[i],
-                                     nmv_count->comps[i].class0, 0);
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      const uint32_t b0 = nmv_count->comps[i].bits[j][0];
-      const uint32_t b1 = nmv_count->comps[i].bits[j][1];
-
-      prob->comps[i].bits[j] = get_binary_prob(b0, b1);
-      branch_ct_bits[i][j][0] = b0;
-      branch_ct_bits[i][j][1] = b1;
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (k = 0; k < CLASS0_SIZE; ++k) {
-      vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
-                                       prob->comps[i].class0_fp[k],
-                                       branch_ct_class0_fp[i][k],
-                                       nmv_count->comps[i].class0_fp[k], 0);
-    }
-    vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
-                                     prob->comps[i].fp,
-                                     branch_ct_fp[i],
-                                     nmv_count->comps[i].fp, 0);
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
-      const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
-      const uint32_t hp0 = nmv_count->comps[i].hp[0];
-      const uint32_t hp1 = nmv_count->comps[i].hp[1];
-
-      prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
-      branch_ct_class0_hp[i][0] = c0_hp0;
-      branch_ct_class0_hp[i][1] = c0_hp1;
-
-      prob->comps[i].hp = get_binary_prob(hp0, hp1);
-      branch_ct_hp[i][0] = hp0;
-      branch_ct_hp[i][1] = hp1;
-    }
-  }
+static void write_mv_update(const vp9_tree_index *tree,
+                            vp9_prob probs[/*n - 1*/],
+                            const unsigned int counts[/*n - 1*/],
+                            int n, vp9_writer *w) {
+  int i;
+  unsigned int branch_ct[32][2];
+
+  // Assuming max number of probabilities <= 32
+  assert(n <= 32);
+
+  vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+  for (i = 0; i < n - 1; ++i)
+    update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) {
   int i, j;
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-  nmv_context *mvc = &cpi->common.fc.nmvc;
-
-  counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
-                        branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                        branch_ct_class0, branch_ct_bits,
-                        branch_ct_class0_fp, branch_ct_fp,
-                        branch_ct_class0_hp, branch_ct_hp);
-
-  for (j = 0; j < MV_JOINTS - 1; ++j)
-    update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
-              NMV_UPDATE_PROB);
-
-  for (i = 0; i < 2; ++i) {
-    update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
-              prob.comps[i].sign, NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j)
-      update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
-                prob.comps[i].classes[j], NMV_UPDATE_PROB);
+  nmv_context *const mvc = &cm->fc.nmvc;
+  nmv_context_counts *const counts = &cm->counts.mv;
 
-    for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
-                prob.comps[i].class0[j], NMV_UPDATE_PROB);
+  write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
 
+  for (i = 0; i < 2; ++i) {
+    nmv_component *comp = &mvc->comps[i];
+    nmv_component_counts *comp_counts = &counts->comps[i];
+
+    update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+    write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes,
+                    MV_CLASSES, w);
+    write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0,
+                    CLASS0_SIZE, w);
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
-                prob.comps[i].bits[j], NMV_UPDATE_PROB);
+      update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
   }
 
   for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      int k;
-      for (k = 0; k < 3; ++k)
-        update_mv(bc, branch_ct_class0_fp[i][j][k],
-                  &mvc->comps[i].class0_fp[j][k],
-                  prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
-    }
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j],
+                      counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
 
-    for (j = 0; j < 3; ++j)
-      update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
-                prob.comps[i].fp[j], NMV_UPDATE_PROB);
+    write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+                    MV_FP_SIZE, w);
   }
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
-                prob.comps[i].class0_hp, NMV_UPDATE_PROB);
-      update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
-                prob.comps[i].hp, NMV_UPDATE_PROB);
+      update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+                MV_UPDATE_PROB);
+      update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
     }
   }
 }
@@ -285,7 +207,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
   const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
   usehp = usehp && vp9_use_mv_hp(ref);
 
-  write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
+  vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
 
@@ -300,34 +222,28 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
   }
 }
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h) {
-  vp9_clear_system_state();
-  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
-  if (mvc_flag_v)
-    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
-  if (mvc_flag_h)
-    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* ctx, int usehp) {
+  vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree);
+  build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);
+  build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
-static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound,
+static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2],
                     nmv_context_counts *counts) {
   int i;
-  for (i = 0; i < 1 + is_compound; ++i) {
-    const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
-                      mv[i].as_mv.col - ref[i].as_mv.col };
+
+  for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+    const MV *ref = &mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+    const MV diff = {mvs[i].as_mv.row - ref->row,
+                     mvs[i].as_mv.col - ref->col};
     vp9_inc_mv(&diff, counts);
   }
 }
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
-  MODE_INFO *mi = x->e_mbd.mi_8x8[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const int is_compound = has_second_ref(mbmi);
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
+  const MODE_INFO *mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (mbmi->sb_type < BLOCK_8X8) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
@@ -338,10 +254,12 @@ void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
         if (mi->bmi[i].as_mode == NEWMV)
-          inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount);
+          inc_mvs(mbmi, mi->bmi[i].as_mv, &cm->counts.mv);
       }
     }
-  } else if (mbmi->mode == NEWMV) {
-    inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount);
+  } else {
+    if (mbmi->mode == NEWMV)
+      inc_mvs(mbmi, mbmi->mv, &cm->counts.mv);
   }
 }
+
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
index 63317788536..e67f9e3b075 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
@@ -12,20 +12,26 @@
 #ifndef VP9_ENCODER_VP9_ENCODEMV_H_
 #define VP9_ENCODER_VP9_ENCODEMV_H_
 
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_encoder.h"
 
-void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_entropy_mv_init(void);
+
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w);
 
 void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h);
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* mvctx, int usehp);
+
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd);
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
new file mode 100644
index 00000000000..911ce7c614b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -0,0 +1,2896 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/internal/vpx_psnr.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_ssim.h"
+#endif
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_resize.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+
+void vp9_coef_tree_initialize(void);
+
+#define DEFAULT_INTERP_FILTER SWITCHABLE
+
+#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1      // Whether to use high precision mv
+                                         //  for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200   // Q threshold for high precision
+                                         // mv. Choose a very high value for
+                                         // now so that HIGH_PRECISION is always
+                                         // chosen.
+
+// #define OUTPUT_YUV_REC
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
+  switch (mode) {
+    case NORMAL:
+      *hr = 1;
+      *hs = 1;
+      break;
+    case FOURFIVE:
+      *hr = 4;
+      *hs = 5;
+      break;
+    case THREEFIVE:
+      *hr = 3;
+      *hs = 5;
+    break;
+    case ONETWO:
+      *hr = 1;
+      *hs = 2;
+    break;
+    default:
+      *hr = 1;
+      *hs = 1;
+       assert(0);
+      break;
+  }
+}
+
+static void set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
+  MACROBLOCK *const mb = &cpi->mb;
+  cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+  if (cpi->common.allow_high_precision_mv) {
+    mb->mvcost = mb->nmvcost_hp;
+    mb->mvsadcost = mb->nmvsadcost_hp;
+  } else {
+    mb->mvcost = mb->nmvcost;
+    mb->mvsadcost = mb->nmvsadcost;
+  }
+}
+
+static void setup_frame(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  // Set up entropy context depending on frame type. The decoder mandates
+  // the use of the default context, index 0, for keyframes and inter
+  // frames where the error_resilient_mode or intra_only flag is set. For
+  // other inter-frames the encoder currently uses only two contexts;
+  // context 1 for ALTREF frames and context 0 for the others.
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+    vp9_setup_past_independence(cm);
+  } else {
+    if (!cpi->use_svc)
+      cm->frame_context_idx = cpi->refresh_alt_ref_frame;
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
+  } else {
+    cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  }
+}
+
+void vp9_initialize_enc(void) {
+  static int init_done = 0;
+
+  if (!init_done) {
+    vp9_init_neighbors();
+    vp9_init_quant_tables();
+
+    vp9_coef_tree_initialize();
+    vp9_tokenize_initialize();
+    vp9_init_me_luts();
+    vp9_rc_init_minq_luts();
+    vp9_entropy_mv_init();
+    vp9_entropy_mode_init();
+    vp9_temporal_filter_init();
+    init_done = 1;
+  }
+}
+
+static void dealloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i;
+
+  // Delete segmentation map
+  vpx_free(cpi->segmentation_map);
+  cpi->segmentation_map = NULL;
+  vpx_free(cm->last_frame_seg_map);
+  cm->last_frame_seg_map = NULL;
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  cpi->coding_context.last_frame_seg_map_copy = NULL;
+
+  vpx_free(cpi->complexity_map);
+  cpi->complexity_map = NULL;
+
+  vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  vpx_free(cpi->active_map);
+  cpi->active_map = NULL;
+
+  vp9_free_frame_buffers(cm);
+
+  vp9_free_frame_buffer(&cpi->last_frame_uf);
+  vp9_free_frame_buffer(&cpi->scaled_source);
+  vp9_free_frame_buffer(&cpi->scaled_last_source);
+  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
+  vp9_lookahead_destroy(cpi->lookahead);
+
+  vpx_free(cpi->tok);
+  cpi->tok = 0;
+
+  vp9_free_pc_tree(&cpi->mb);
+
+  for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+    LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
+    vpx_free(lc->rc_twopass_stats_in.buf);
+    lc->rc_twopass_stats_in.buf = NULL;
+    lc->rc_twopass_stats_in.sz = 0;
+  }
+}
+
+static void save_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+
+  // Stores a snapshot of key state variables which can subsequently be
+  // restored with a call to restore_coding_context(). These functions are
+  // intended for use in a re-code loop in vp9_compress_frame where the
+  // quantizer value is adjusted between loop iterations.
+  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
+  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
+  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
+
+  vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
+
+  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
+             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
+
+  vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+  vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+
+  cc->fc = cm->fc;
+}
+
+static void restore_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+
+  // Restore key state variables to the snapshot state stored in the
+  // previous call to save_coding_context().
+  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
+  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+  vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
+
+  vpx_memcpy(cm->last_frame_seg_map,
+             cpi->coding_context.last_frame_seg_map_copy,
+             (cm->mi_rows * cm->mi_cols));
+
+  vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+  vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+
+  cm->fc = cc->fc;
+}
+
+static void configure_static_seg_features(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  struct segmentation *const seg = &cm->seg;
+
+  int high_q = (int)(rc->avg_q > 48.0);
+  int qi_delta;
+
+  // Disable and clear down for KF
+  if (cm->frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation
+    vp9_disable_segmentation(seg);
+
+    // Clear down the segment features.
+    vp9_clearall_segfeatures(seg);
+  } else if (cpi->refresh_alt_ref_frame) {
+    // If this is an alt ref frame
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+    seg->update_map = 0;
+    seg->update_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation and individual segment features by default
+    vp9_disable_segmentation(seg);
+    vp9_clearall_segfeatures(seg);
+
+    // Scan frames from current to arf frame.
+    // This function re-enables segmentation if appropriate.
+    vp9_update_mbgraph_stats(cpi);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+
+      qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875);
+      vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+      vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+
+      vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+      vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+      // Where relevant assume segment data is delta data
+      seg->abs_delta = SEGMENT_DELTADATA;
+    }
+  } else if (seg->enabled) {
+    // All other frames if segmentation has been enabled
+
+    // First normal frame in a valid gf or alt ref group
+    if (rc->frames_since_golden == 0) {
+      // Set up segment features for normal frames in an arf group
+      if (rc->source_alt_ref_active) {
+        seg->update_map = 0;
+        seg->update_data = 1;
+        seg->abs_delta = SEGMENT_DELTADATA;
+
+        qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125);
+        vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+        vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+        vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+        vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+        // Segment coding disabled for compred testing
+        if (high_q || (cpi->static_mb_pct == 100)) {
+          vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+          vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+          vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+        }
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
+        vp9_disable_segmentation(seg);
+
+        vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+        seg->update_map = 0;
+        seg->update_data = 0;
+
+        vp9_clearall_segfeatures(seg);
+      }
+    } else if (rc->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
+
+      // Enable ref frame features for segment 0 as well
+      vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+      vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+      // All mbs should use ALTREF_FRAME
+      vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+      // Skip all MBs if high Q (0,0 mv and skip coeffs)
+      if (high_q) {
+        vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+        vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+      }
+      // Enable data update
+      seg->update_data = 1;
+    } else {
+      // All other frames.
+
+      // No updates.. leave things as they are.
+      seg->update_map = 0;
+      seg->update_data = 0;
+    }
+  }
+}
+
+static void update_reference_segmentation_map(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+  uint8_t *cache_ptr = cm->last_frame_seg_map;
+  int row, col;
+
+  for (row = 0; row < cm->mi_rows; row++) {
+    MODE_INFO **mi_8x8 = mi_8x8_ptr;
+    uint8_t *cache = cache_ptr;
+    for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
+      cache[0] = mi_8x8[0]->mbmi.segment_id;
+    mi_8x8_ptr += cm->mi_stride;
+    cache_ptr += cm->mi_cols;
+  }
+}
+
+
+static void set_speed_features(VP9_COMP *cpi) {
+#if CONFIG_INTERNAL_STATS
+  int i;
+  for (i = 0; i < MAX_MODES; ++i)
+    cpi->mode_chosen_counts[i] = 0;
+#endif
+
+  vp9_set_speed_features(cpi);
+
+  // Set rd thresholds based on mode and speed setting
+  vp9_set_rd_speed_thresholds(cpi);
+  vp9_set_rd_speed_thresholds_sub8x8(cpi);
+
+  cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
+  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
+    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
+  }
+}
+
+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+
+  cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
+                                      cm->subsampling_x, cm->subsampling_y,
+                                      oxcf->lag_in_frames);
+  if (!cpi->lookahead)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate lag buffers");
+
+  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               oxcf->width, oxcf->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+}
+
+// Allocates the encoder's frame-sized working storage at cm->width x
+// cm->height: the common frame buffers, the last-frame / scaled-source /
+// scaled-last-source buffers, the token array and the pc tree. Each failure
+// is reported through vpx_internal_error() with VPX_CODEC_MEM_ERROR.
+void vp9_alloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (vp9_alloc_frame_buffers(cm, cm->width, cm->height) != 0) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffers");
+  }
+
+  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9_ENC_BORDER_IN_PIXELS) != 0) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame buffer");
+  }
+
+  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9_ENC_BORDER_IN_PIXELS) != 0) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled source buffer");
+  }
+
+  if (vp9_alloc_frame_buffer(&cpi->scaled_last_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9_ENC_BORDER_IN_PIXELS) != 0) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled last source buffer");
+  }
+
+  // Drop any previous token array before sizing a new one for this frame.
+  vpx_free(cpi->tok);
+
+  {
+    const unsigned int token_count = get_token_alloc(cm->mb_rows, cm->mb_cols);
+
+    CHECK_MEM_ERROR(cm, cpi->tok,
+                    vpx_calloc(token_count, sizeof(*cpi->tok)));
+  }
+
+  vp9_setup_pc_tree(cm, &cpi->mb);
+}
+
+// Re-derives frame-size dependent encoder state: calls
+// vp9_update_frame_size(), reallocates last_frame_uf, scaled_source and
+// scaled_last_source at cm->width x cm->height, rebuilds the motion-search
+// site configuration for the scaled source's luma stride, and re-initialises
+// the macroblockd. Reallocation failures are reported through
+// vpx_internal_error() with VPX_CODEC_MEM_ERROR.
+static void update_frame_size(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  vp9_update_frame_size(cm);
+
+  // Update size of buffers local to this frame
+  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate last frame buffer");
+
+  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled source buffer");
+
+  if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled last source buffer");
+
+  {
+    // The search-site tables are built from the luma stride, which may have
+    // changed with the reallocation above.
+    int y_stride = cpi->scaled_source.y_stride;
+
+    // Only NSTEP and DIAMOND searches get a site configuration here; any
+    // other search method leaves cpi->ss_cfg untouched.
+    if (cpi->sf.search_method == NSTEP) {
+      vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+    } else if (cpi->sf.search_method == DIAMOND) {
+      vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+    }
+  }
+
+  init_macroblockd(cm, xd);
+}
+
+// Stores the requested frame rate in the encoder config, substituting 30 for
+// any value below 0.1, and then refreshes the rate-control state.
+void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
+  if (framerate < 0.1) {
+    cpi->oxcf.framerate = 30;
+  } else {
+    cpi->oxcf.framerate = framerate;
+  }
+
+  vp9_rc_update_framerate(cpi);
+}
+
+// Returns val * num / denom, evaluated left to right in 64-bit integer
+// arithmetic (the product is formed first, then truncating division).
+int64_t vp9_rescale(int64_t val, int64_t num, int denom) {
+  const int64_t product = val * num;
+  const int64_t divisor = denom;
+
+  return product / divisor;
+}
+
+// Derives the tile layout: log2_tile_rows is copied from the encoder config,
+// while log2_tile_cols is the configured value clamped to the range that
+// vp9_get_tile_n_bits() reports for the current mi_cols.
+static void set_tile_limits(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int lowest_log2_cols, highest_log2_cols;
+
+  vp9_get_tile_n_bits(cm->mi_cols, &lowest_log2_cols, &highest_log2_cols);
+
+  cm->log2_tile_rows = cpi->oxcf.tile_rows;
+  cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+                             lowest_log2_cols, highest_log2_cols);
+}
+
+// One-time configuration of a freshly created encoder: copies |oxcf| into
+// cpi, seeds the common profile/bit depth/dimensions, allocates the
+// frame-sized buffers, sets up SVC layer counts, and then delegates the rest
+// to vp9_change_config() before assigning the default reference buffer
+// indices and tile limits.
+static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  cpi->oxcf = *oxcf;
+
+  cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+
+  cm->width = oxcf->width;
+  cm->height = oxcf->height;
+  // Subsampling is reset to 0/0 here before the buffers are allocated.
+  cm->subsampling_x = 0;
+  cm->subsampling_y = 0;
+  // Must follow the width/height/subsampling assignments it reads from cm.
+  vp9_alloc_compressor_data(cpi);
+
+  // Spatial scalability.
+  cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
+  // Temporal scalability.
+  cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
+
+  // Layer contexts are only initialised for temporal layers under CBR, or
+  // for spatial layers in TWO_PASS_SECOND_BEST mode.
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.rc_mode == RC_MODE_CBR) ||
+      (cpi->svc.number_spatial_layers > 1 &&
+      cpi->oxcf.mode == TWO_PASS_SECOND_BEST)) {
+    vp9_init_layer_context(cpi);
+  }
+
+  // change includes all joint functionality
+  vp9_change_config(cpi, oxcf);
+
+  cpi->static_mb_pct = 0;
+
+  // Default slots for the last / golden / alt-ref reference buffers.
+  cpi->lst_fb_idx = 0;
+  cpi->gld_fb_idx = 1;
+  cpi->alt_fb_idx = 2;
+
+  // NOTE(review): vp9_change_config() above already calls set_tile_limits();
+  // this second call looks redundant - confirm before removing.
+  set_tile_limits(cpi);
+}
+
+// Maps an encoding MODE to its pass number: 0 for the one-pass and realtime
+// modes, 1 for the first pass of two-pass, 2 for either second-pass mode,
+// and -1 for any value not listed.
+static int get_pass(MODE mode) {
+  int pass = -1;
+
+  switch (mode) {
+    case REALTIME:
+    case ONE_PASS_GOOD:
+    case ONE_PASS_BEST:
+      pass = 0;
+      break;
+
+    case TWO_PASS_FIRST:
+      pass = 1;
+      break;
+
+    case TWO_PASS_SECOND_GOOD:
+    case TWO_PASS_SECOND_BEST:
+      pass = 2;
+      break;
+  }
+
+  return pass;
+}
+
+// Applies a (new) encoder configuration to |cpi|. Copies |oxcf| into
+// cpi->oxcf, derives the pass number, selects the inverse 4x4 transform for
+// lossless vs. lossy coding, resets refresh flags and segment features,
+// rescales the buffer levels against the target bandwidth, updates the frame
+// rate and quantizer limits, refreshes frame-size dependent buffers, and
+// finally recomputes the tile limits. Statement order matters: several later
+// steps read fields of cpi->oxcf that earlier steps overwrite.
+void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // NOTE(review): the comparison is redundant; the assignment alone has the
+  // same effect.
+  if (cm->profile != oxcf->profile)
+    cm->profile = oxcf->profile;
+  cm->bit_depth = oxcf->bit_depth;
+
+  // Profile / bit-depth consistency is only checked via assert(), so it is
+  // not enforced when assertions are compiled out (NDEBUG).
+  if (cm->profile <= PROFILE_1)
+    assert(cm->bit_depth == BITS_8);
+  else
+    assert(cm->bit_depth > BITS_8);
+
+  cpi->oxcf = *oxcf;
+  cpi->pass = get_pass(cpi->oxcf.mode);
+  if (cpi->oxcf.mode == REALTIME)
+    cpi->oxcf.play_alternate = 0;
+
+  // NOTE(review): already copied by the struct assignment above.
+  cpi->oxcf.lossless = oxcf->lossless;
+  if (cpi->oxcf.lossless) {
+    // In lossless mode, make sure right quantizer range and correct transform
+    // is set.
+    cpi->oxcf.worst_allowed_q = 0;
+    cpi->oxcf.best_allowed_q = 0;
+    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
+  } else {
+    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
+  }
+  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_last_frame = 1;
+  cm->refresh_frame_context = 1;
+  cm->reset_frame_context = 0;
+
+  vp9_reset_segment_features(&cm->seg);
+  set_high_precision_mv(cpi, 0);
+
+  {
+    int i;
+
+    // Every segment starts from the configured global breakout threshold.
+    for (i = 0; i < MAX_SEGMENTS; i++)
+      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+  }
+  cpi->encode_breakout = cpi->oxcf.encode_breakout;
+
+  // local file playback mode == really big buffer
+  if (cpi->oxcf.rc_mode == RC_MODE_VBR) {
+    cpi->oxcf.starting_buffer_level   = 60000;
+    cpi->oxcf.optimal_buffer_level    = 60000;
+    cpi->oxcf.maximum_buffer_size     = 240000;
+  }
+
+  // Convert target bandwidth from Kbit/s to Bit/s
+  cpi->oxcf.target_bandwidth       *= 1000;
+
+  // Each buffer level becomes level * target_bandwidth / 1000 (see
+  // vp9_rescale()), using the bandwidth already converted above.
+  cpi->oxcf.starting_buffer_level =
+      vp9_rescale(cpi->oxcf.starting_buffer_level,
+                  cpi->oxcf.target_bandwidth, 1000);
+
+  // Set or reset optimal and maximum buffer levels.
+  if (cpi->oxcf.optimal_buffer_level == 0)
+    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+  else
+    cpi->oxcf.optimal_buffer_level =
+        vp9_rescale(cpi->oxcf.optimal_buffer_level,
+                    cpi->oxcf.target_bandwidth, 1000);
+
+  if (cpi->oxcf.maximum_buffer_size == 0)
+    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+  else
+    cpi->oxcf.maximum_buffer_size =
+        vp9_rescale(cpi->oxcf.maximum_buffer_size,
+                    cpi->oxcf.target_bandwidth, 1000);
+  // Under a configuration change, where maximum_buffer_size may change,
+  // keep buffer level clipped to the maximum allowed buffer size.
+  rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size);
+  rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size);
+
+  // Set up frame rate and related parameters rate control values.
+  vp9_new_framerate(cpi, cpi->oxcf.framerate);
+
+  // Set absolute upper and lower quality limits
+  rc->worst_quality = cpi->oxcf.worst_allowed_q;
+  rc->best_quality = cpi->oxcf.best_allowed_q;
+
+  cm->interp_filter = DEFAULT_INTERP_FILTER;
+
+  cm->display_width = cpi->oxcf.width;
+  cm->display_height = cpi->oxcf.height;
+
+  if (cpi->initial_width) {
+    // Increasing the size of the frame beyond the first seen frame, or some
+    // otherwise signaled maximum size, is not supported.
+    // TODO(jkoleszar): exit gracefully.
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
+  }
+  update_frame_size(cpi);
+
+  // Layer contexts are refreshed for temporal layers under CBR, or for
+  // spatial layers in the second pass.
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.rc_mode == RC_MODE_CBR) ||
+      (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) {
+    vp9_update_layer_context_change_config(cpi,
+                                           (int)cpi->oxcf.target_bandwidth);
+  }
+
+#if CONFIG_MULTIPLE_ARF
+  vp9_zero(cpi->alt_ref_source);
+#else
+  cpi->alt_ref_source = NULL;
+#endif
+  rc->is_src_frame_alt_ref = 0;
+
+#if 0
+  // Experimental RD Code
+  cpi->frame_distortion = 0;
+  cpi->last_frame_distortion = 0;
+#endif
+
+  set_tile_limits(cpi);
+
+  // Any externally requested refresh overrides are dropped on reconfigure.
+  cpi->ext_refresh_frame_flags_pending = 0;
+  cpi->ext_refresh_frame_context_pending = 0;
+}
+
+// NOTE(review): despite the name, this fallback value is ln(2) (0.693...),
+// which is the divisor log2f() below needs to turn a natural log into a
+// base-2 log. It is only defined if M_LOG2_E is not already provided.
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+// Base-2 logarithm computed as log(x) / M_LOG2_E.
+// NOTE(review): this macro reuses the name of the C99 <math.h> function
+// log2f - confirm that shadowing it is intended.
+#define log2f(x) (log (x) / (float) M_LOG2_E)
+
+// Fills the four joint SAD costs: 600 for entry 0 and 300 for entries 1..3.
+// |mvjointsadcost| must have room for at least four ints.
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  int joint;
+
+  mvjointsadcost[0] = 600;
+  for (joint = 1; joint <= 3; ++joint) {
+    mvjointsadcost[joint] = 300;
+  }
+}
+
+// Fills both SAD cost tables symmetrically around index 0. Entry 0 is 0;
+// entries +i and -i (1 <= i <= MV_MAX) both hold
+// 256 * (2 * (log2f(8 * i) + .6)) truncated to int. Each table pointer must
+// therefore be valid for indices -MV_MAX..MV_MAX.
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  int *const table0 = mvsadcost[0];
+  int *const table1 = mvsadcost[1];
+  int dist = 1;
+
+  table0[0] = 0;
+  table1[0] = 0;
+
+  do {
+    const double raw_cost = 256 * (2 * (log2f(8 * dist) + .6));
+    const int cost = (int)raw_cost;
+
+    table0[dist] = cost;
+    table1[dist] = cost;
+    table0[-dist] = cost;
+    table1[-dist] = cost;
+  } while (++dist <= MV_MAX);
+}
+
+// High-precision counterpart of the SAD cost setup. It uses the same formula
+// as the regular tables: 0 at index 0, and
+// 256 * (2 * (log2f(8 * i) + .6)) truncated to int at indices +i and -i for
+// 1 <= i <= MV_MAX. Each table pointer must be valid for -MV_MAX..MV_MAX.
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+  int *const hp_table0 = mvsadcost[0];
+  int *const hp_table1 = mvsadcost[1];
+  int offset = 1;
+
+  hp_table0[0] = 0;
+  hp_table1[0] = 0;
+
+  do {
+    const double raw_cost = 256 * (2 * (log2f(8 * offset) + .6));
+    const int cost = (int)raw_cost;
+
+    hp_table0[offset] = cost;
+    hp_table1[offset] = cost;
+    hp_table0[-offset] = cost;
+    hp_table1[-offset] = cost;
+  } while (++offset <= MV_MAX);
+}
+
+
+// Allocates and initialises a VP9 encoder instance from |oxcf|.
+//
+// Returns the new VP9_COMP, or NULL/0 when the initial allocation fails or
+// when the setjmp() error handler below is entered (in which case the
+// partially built instance is released with vp9_remove_compressor()).
+// The function wires up, in order: configuration and rate control, the
+// per-frame maps, statistics counters, motion-vector cost tables, two-pass
+// statistics, speed features, the per-block-size SAD/variance function
+// table, the quantizer and the loop filter.
+VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
+  unsigned int i, j;
+  VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp9_zero(*cpi);
+
+  // Error recovery point: presumably vpx_internal_error()/CHECK_MEM_ERROR
+  // longjmp back here while cm->error.setjmp is set - confirm in vpx_codec.
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    vp9_remove_compressor(cpi);
+    return 0;
+  }
+
+  cm->error.setjmp = 1;
+
+  vp9_rtcd();
+
+  cpi->use_svc = 0;
+
+  init_config(cpi, oxcf);
+  vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc);
+
+  cm->current_video_frame = 0;
+
+  // Set reference frame sign bias for ALTREF frame to 1 (for now)
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+  cpi->gold_is_last = 0;
+  cpi->alt_is_last = 0;
+  cpi->gold_is_alt = 0;
+
+  // Create the encoder segmentation map and set all entries to 0
+  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // Create a complexity map used for rd adjustment
+  CHECK_MEM_ERROR(cm, cpi->complexity_map,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // Create a map used for cyclic background refresh.
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+  // And a place holder structure is the coding context
+  // for use if we want to save and restore it
+  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // The active map has one byte per macroblock, all initially set to 1.
+  CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1));
+  vpx_memset(cpi->active_map, 1, cm->MBs);
+  cpi->active_map_enabled = 0;
+
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+                   sizeof(cpi->mbgraph_stats[0])); i++) {
+    CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
+                    vpx_calloc(cm->MBs *
+                               sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+  }
+
+  cpi->refresh_alt_ref_frame = 0;
+
+#if CONFIG_MULTIPLE_ARF
+  // Turn multiple ARF usage on/off. This is a quick hack for the initial test
+  // version. It should eventually be set via the codec API.
+  cpi->multi_arf_enabled = 1;
+
+  if (cpi->multi_arf_enabled) {
+    cpi->sequence_number = 0;
+    cpi->frame_coding_order_period = 0;
+    vp9_zero(cpi->frame_coding_order);
+    vp9_zero(cpi->arf_buffer_idx);
+  }
+#endif
+
+  // PSNR accumulation is on exactly when internal stats are compiled in.
+  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+  cpi->b_calculate_ssimg = 0;
+
+  cpi->count = 0;
+  cpi->bytes = 0;
+
+  if (cpi->b_calculate_psnr) {
+    cpi->total_y = 0.0;
+    cpi->total_u = 0.0;
+    cpi->total_v = 0.0;
+    cpi->total = 0.0;
+    cpi->total_sq_error = 0;
+    cpi->total_samples = 0;
+
+    cpi->totalp_y = 0.0;
+    cpi->totalp_u = 0.0;
+    cpi->totalp_v = 0.0;
+    cpi->totalp = 0.0;
+    cpi->totalp_sq_error = 0;
+    cpi->totalp_samples = 0;
+
+    cpi->tot_recode_hits = 0;
+    cpi->summed_quality = 0;
+    cpi->summed_weights = 0;
+    cpi->summedp_quality = 0;
+    cpi->summedp_weights = 0;
+  }
+
+  if (cpi->b_calculate_ssimg) {
+    cpi->total_ssimg_y = 0;
+    cpi->total_ssimg_u = 0;
+    cpi->total_ssimg_v = 0;
+    cpi->total_ssimg_all = 0;
+  }
+
+#endif
+
+  cpi->first_time_stamp_ever = INT64_MAX;
+
+  // The cost pointers are offset by MV_MAX into their backing arrays so they
+  // can be indexed with negative as well as positive values.
+  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
+  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
+  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
+  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
+  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->mb.nmvsadcost);
+
+  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
+  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
+  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
+  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+
+  // NOTE(review): the debug dump files below are opened without checking the
+  // fopen() result.
+#ifdef OUTPUT_YUV_SRC
+  yuv_file = fopen("bd.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+  framepsnr = fopen("framepsnr.stt", "a");
+  kf_list = fopen("kf_list.stt", "w");
+#endif
+
+  cpi->output_pkt_list = oxcf->output_pkt_list;
+
+  cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+  if (cpi->pass == 1) {
+    vp9_init_first_pass(cpi);
+  } else if (cpi->pass == 2) {
+    // Number of whole FIRSTPASS_STATS records in the caller's stats buffer.
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+    if (cpi->svc.number_spatial_layers > 1
+        && cpi->svc.number_temporal_layers == 1) {
+      FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
+      FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
+      // NOTE(review): this declaration shadows the outer unsigned |i|.
+      int i;
+
+      // The last ss_number_layers records are read as per-layer trailers:
+      // their |count| field sizes each layer's private stats buffer.
+      // NOTE(review): if packets < ss_number_layers the index below is
+      // negative - confirm the caller guarantees enough records.
+      for (i = 0; i < oxcf->ss_number_layers; ++i) {
+        FIRSTPASS_STATS *const last_packet_for_layer =
+            &stats[packets - oxcf->ss_number_layers + i];
+        const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
+        const int packets_in_layer = (int)last_packet_for_layer->count + 1;
+        if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
+          LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id];
+
+          vpx_free(lc->rc_twopass_stats_in.buf);
+
+          lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
+          CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+                          vpx_malloc(lc->rc_twopass_stats_in.sz));
+          lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
+          lc->twopass.stats_in = lc->twopass.stats_in_start;
+          lc->twopass.stats_in_end = lc->twopass.stats_in_start
+                                     + packets_in_layer - 1;
+          stats_copy[layer_id] = lc->rc_twopass_stats_in.buf;
+        }
+      }
+
+      // Demultiplex the interleaved records into the per-layer buffers;
+      // stats_copy[] acts as a write cursor for each layer.
+      // NOTE(review): nothing here bounds the cursor by packets_in_layer.
+      for (i = 0; i < packets; ++i) {
+        const int layer_id = (int)stats[i].spatial_layer_id;
+        if (layer_id >= 0 && layer_id < oxcf->ss_number_layers
+            && stats_copy[layer_id] != NULL) {
+          *stats_copy[layer_id] = stats[i];
+          ++stats_copy[layer_id];
+        }
+      }
+
+      vp9_init_second_pass_spatial_svc(cpi);
+    } else {
+      // Non-SVC second pass reads the caller's buffer in place.
+      // NOTE(review): with packets == 0, stats_in_end points before the
+      // start of the buffer - confirm this cannot happen.
+      cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+      cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+      cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+      vp9_init_second_pass(cpi);
+    }
+  }
+
+  set_speed_features(cpi);
+
+  // Default rd threshold factors for mode selection
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    for (j = 0; j < MAX_MODES; ++j)
+      cpi->rd.thresh_freq_fact[i][j] = 32;
+  }
+
+  // BFP fills one cpi->fn_ptr[] entry (SAD, averaging SAD, variance,
+  // sub-pixel variance, averaging sub-pixel variance and the x3/x8/x4d SAD
+  // variants) for block size BT. NULL marks a variant that is not provided.
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
+    cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].sdaf           = SDAF; \
+    cpi->fn_ptr[BT].vf             = VF; \
+    cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
+    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
+
+  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+      vp9_variance32x16, vp9_sub_pixel_variance32x16,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
+
+  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+      vp9_variance16x32, vp9_sub_pixel_variance16x32,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
+
+  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+      vp9_variance64x32, vp9_sub_pixel_variance64x32,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
+
+  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+      vp9_variance32x64, vp9_sub_pixel_variance32x64,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
+
+  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+      vp9_variance32x32, vp9_sub_pixel_variance32x32,
+      vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
+      vp9_sad32x32x4d)
+
+  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+      vp9_variance64x64, vp9_sub_pixel_variance64x64,
+      vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
+      vp9_sad64x64x4d)
+
+  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+      vp9_variance16x16, vp9_sub_pixel_variance16x16,
+      vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
+
+  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+      vp9_variance16x8, vp9_sub_pixel_variance16x8,
+      vp9_sub_pixel_avg_variance16x8,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+
+  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+      vp9_variance8x16, vp9_sub_pixel_variance8x16,
+      vp9_sub_pixel_avg_variance8x16,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+
+  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+      vp9_variance8x8, vp9_sub_pixel_variance8x8,
+      vp9_sub_pixel_avg_variance8x8,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+      vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
+
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+      vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
+
+  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+      vp9_variance4x4, vp9_sub_pixel_variance4x4,
+      vp9_sub_pixel_avg_variance4x4,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+
+  cpi->full_search_sad = vp9_full_search_sad;
+  cpi->diamond_search_sad = vp9_diamond_search_sad;
+  cpi->refining_search_sad = vp9_refining_search_sad;
+
+  /* vp9_init_quantizer() is first called here. Add check in
+   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
+   * called later when needed. This will avoid unnecessary calls of
+   * vp9_init_quantizer() for every frame.
+   */
+  vp9_init_quantizer(cpi);
+
+  vp9_loop_filter_init(cm);
+
+  // Construction succeeded: disarm the error recovery point set up above.
+  cm->error.setjmp = 0;
+
+  return cpi;
+}
+
+// Destroys an encoder instance created by vp9_create_compressor().
+//
+// A NULL |cpi| is ignored. When at least one frame was encoded and the build
+// has CONFIG_INTERNAL_STATS, a summary line is appended to "opsnr.stt"
+// (skipped for first-pass encodes). The statistics file is best-effort: if
+// it cannot be opened the summary is silently skipped instead of writing
+// through a NULL stream. Afterwards all encoder-owned memory, including
+// |cpi| itself, is released and any debug dump files are closed.
+void vp9_remove_compressor(VP9_COMP *cpi) {
+  unsigned int i;
+
+  if (!cpi)
+    return;
+
+  if (cpi->common.current_video_frame > 0) {
+#if CONFIG_INTERNAL_STATS
+
+    vp9_clear_system_state();
+
+    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
+    if (cpi->pass != 1) {
+      // fopen() returns NULL on failure (e.g. unwritable working directory);
+      // every use of |f| below is guarded accordingly.
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded = (cpi->last_end_time_stamp_seen
+                             - cpi->first_time_stamp_ever) / 10000000.000;
+      double total_encode_time = (cpi->time_receive_data +
+                                  cpi->time_compress_data)   / 1000.000;
+      double dr = (double)cpi->bytes * (double) 8 / (double)1000
+                  / time_encoded;
+
+      if (f != NULL && cpi->b_calculate_psnr) {
+        const double total_psnr =
+            vpx_sse_to_psnr((double)cpi->total_samples, 255.0,
+                            (double)cpi->total_sq_error);
+        const double totalp_psnr =
+            vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0,
+                            (double)cpi->totalp_sq_error);
+        const double total_ssim = 100 * pow(cpi->summed_quality /
+                                                cpi->summed_weights, 8.0);
+        const double totalp_ssim = 100 * pow(cpi->summedp_quality /
+                                                cpi->summedp_weights, 8.0);
+
+        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                "VPXSSIM\tVPSSIMP\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
+                dr, cpi->total / cpi->count, total_psnr,
+                cpi->totalp / cpi->count, totalp_psnr, total_ssim, totalp_ssim,
+                total_encode_time);
+      }
+
+      if (f != NULL && cpi->b_calculate_ssimg) {
+        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+                cpi->total_ssimg_y / cpi->count,
+                cpi->total_ssimg_u / cpi->count,
+                cpi->total_ssimg_v / cpi->count,
+                cpi->total_ssimg_all / cpi->count, total_encode_time);
+      }
+
+      if (f != NULL)
+        fclose(f);
+    }
+
+#endif
+
+#if 0
+    {
+      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
+             cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
+             cpi->time_compress_data / 1000,
+             (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+    }
+#endif
+  }
+
+  dealloc_compressor_data(cpi);
+  vpx_free(cpi->tok);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+                  sizeof(cpi->mbgraph_stats[0]); ++i) {
+    vpx_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+  vp9_remove_common(&cpi->common);
+  vpx_free(cpi);
+
+  // The debug dump files are opened with an unchecked fopen() at creation
+  // time, so they may be NULL here; fclose(NULL) is undefined behaviour.
+#ifdef OUTPUT_YUV_SRC
+  if (yuv_file)
+    fclose(yuv_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  if (yuv_rec_file)
+    fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+  if (keyfile)
+    fclose(keyfile);
+
+  if (framepsnr)
+    fclose(framepsnr);
+
+  if (kf_list)
+    fclose(kf_list);
+
+#endif
+}
+// Returns the sum of squared differences between planes |a| and |b| over a
+// width x height region. The region is split into three parts: the right
+// strip of width % 16 columns (full height), the bottom strip of
+// height % 16 rows (excluding the right strip), and the remaining whole
+// 16x16 tiles. variance() is used here only for its sse output.
+static int64_t get_sse(const uint8_t *a, int a_stride,
+                       const uint8_t *b, int b_stride,
+                       int width, int height) {
+  const int extra_cols = width % 16;
+  const int extra_rows = height % 16;
+  const int tile_cols = width / 16;
+  const int tile_rows = height / 16;
+  int64_t accum = 0;
+  unsigned int sse = 0;
+  int sum = 0;
+  int tx, ty;
+
+  // Right-hand strip, covering every row.
+  if (extra_cols > 0) {
+    const int x0 = width - extra_cols;
+    variance(a + x0, a_stride, b + x0, b_stride,
+             extra_cols, height, &sse, &sum);
+    accum += sse;
+  }
+
+  // Bottom strip, stopping short of the right-hand strip handled above.
+  if (extra_rows > 0) {
+    const int y0 = height - extra_rows;
+    variance(a + y0 * a_stride, a_stride,
+             b + y0 * b_stride, b_stride,
+             width - extra_cols, extra_rows, &sse, &sum);
+    accum += sse;
+  }
+
+  // Whole 16x16 tiles, walked row by row.
+  for (ty = 0; ty < tile_rows; ++ty) {
+    const uint8_t *tile_a = a;
+    const uint8_t *tile_b = b;
+
+    for (tx = 0; tx < tile_cols; ++tx) {
+      vp9_mse16x16(tile_a, a_stride, tile_b, b_stride, &sse);
+      accum += sse;
+
+      tile_a += 16;
+      tile_b += 16;
+    }
+
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+
+  return accum;
+}
+
+// Per-frame PSNR measurements. In every array, index 0 holds the combined
+// figure for all planes and indices 1..3 hold the Y, U and V planes.
+typedef struct {
+  double psnr[4];       // total/y/u/v
+  uint64_t sse[4];      // total/y/u/v
+  uint32_t samples[4];  // total/y/u/v
+} PSNR_STATS;
+
+// Compares frames |a| and |b| plane by plane and fills |psnr|: slots 1..3
+// receive the Y/U/V sample count, SSE and PSNR, and slot 0 receives the
+// totals across the three planes. Plane dimensions are taken from |a|.
+static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                      PSNR_STATS *psnr) {
+  const int plane_w[3]        = {a->y_width,  a->uv_width,  a->uv_width };
+  const int plane_h[3]        = {a->y_height, a->uv_height, a->uv_height};
+  const uint8_t *plane_a[3]   = {a->y_buffer, a->u_buffer,  a->v_buffer };
+  const int stride_a[3]       = {a->y_stride, a->uv_stride, a->uv_stride};
+  const uint8_t *plane_b[3]   = {b->y_buffer, b->u_buffer,  b->v_buffer };
+  const int stride_b[3]       = {b->y_stride, b->uv_stride, b->uv_stride};
+  uint64_t sse_all = 0;
+  uint32_t samples_all = 0;
+  int plane;
+
+  for (plane = 0; plane < 3; ++plane) {
+    const int slot = 1 + plane;  // slot 0 is reserved for the totals
+    const int w = plane_w[plane];
+    const int h = plane_h[plane];
+    const uint32_t plane_samples = w * h;
+    const uint64_t plane_sse = get_sse(plane_a[plane], stride_a[plane],
+                                       plane_b[plane], stride_b[plane],
+                                       w, h);
+
+    psnr->sse[slot] = plane_sse;
+    psnr->samples[slot] = plane_samples;
+    psnr->psnr[slot] = vpx_sse_to_psnr(plane_samples, 255.0,
+                                       (double)plane_sse);
+
+    sse_all += plane_sse;
+    samples_all += plane_samples;
+  }
+
+  psnr->sse[0] = sse_all;
+  psnr->samples[0] = samples_all;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)samples_all, 255.0,
+                                  (double)sse_all);
+}
+
+// Measures PSNR between the source frame and the frame to show, and appends
+// the result to the encoder's output packet list as a VPX_CODEC_PSNR_PKT.
+static void generate_psnr_packet(VP9_COMP *cpi) {
+  PSNR_STATS stats;
+  struct vpx_codec_cx_pkt pkt;
+  int slot;
+
+  calc_psnr(cpi->Source, cpi->common.frame_to_show, &stats);
+
+  pkt.kind = VPX_CODEC_PSNR_PKT;
+  for (slot = 0; slot < 4; ++slot) {
+    pkt.data.psnr.samples[slot] = stats.samples[slot];
+    pkt.data.psnr.sse[slot] = stats.sse[slot];
+    pkt.data.psnr.psnr[slot] = stats.psnr[slot];
+  }
+
+  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+// Sets which reference frames may be used for prediction.
+//
+// |ref_frame_flags| is a bitmask; only values 0..7 (the three low bits) are
+// accepted. Returns 0 on success and -1 without modifying |cpi| when the
+// mask is out of range. Negative values are rejected as well: they have
+// bits set outside the valid mask and previously slipped past the
+// upper-bound-only check.
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
+  if (ref_frame_flags < 0 || ref_frame_flags > 7)
+    return -1;
+
+  cpi->ref_frame_flags = ref_frame_flags;
+  return 0;
+}
+
+// Records an external request for which reference buffers the next frame
+// should refresh. Each ext_refresh_* field becomes 1 when its flag bit is
+// present in |ref_frame_flags| and 0 otherwise, and the request is marked
+// as pending.
+void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
+  const int wants_last = (ref_frame_flags & VP9_LAST_FLAG) ? 1 : 0;
+  const int wants_golden = (ref_frame_flags & VP9_GOLD_FLAG) ? 1 : 0;
+  const int wants_alt_ref = (ref_frame_flags & VP9_ALT_FLAG) ? 1 : 0;
+
+  cpi->ext_refresh_golden_frame = wants_golden;
+  cpi->ext_refresh_alt_ref_frame = wants_alt_ref;
+  cpi->ext_refresh_last_frame = wants_last;
+  cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+// Translates a single reference flag (last / golden / alt-ref) into the
+// corresponding reference frame buffer. Returns NULL when |ref_frame_flag|
+// is not exactly one of the three known flags.
+static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(VP9_COMP *cpi,
+                                VP9_REFFRAME ref_frame_flag) {
+  MV_REFERENCE_FRAME selected;
+
+  if (ref_frame_flag == VP9_LAST_FLAG) {
+    selected = LAST_FRAME;
+  } else if (ref_frame_flag == VP9_GOLD_FLAG) {
+    selected = GOLDEN_FRAME;
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
+    selected = ALTREF_FRAME;
+  } else {
+    selected = NONE;
+  }
+
+  if (selected == NONE) {
+    return NULL;
+  }
+  return get_ref_frame_buffer(cpi, selected);
+}
+
+// Copies the reference frame selected by |ref_frame_flag| into |sd|.
+// Returns 0 on success, or -1 when the flag does not name a reference.
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *ref_buf = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+
+  if (!ref_buf) {
+    return -1;
+  }
+
+  vp8_yv12_copy_frame(ref_buf, sd);
+  return 0;
+}
+
+// Stores in |*fb| a pointer to the frame buffer that reference slot |index|
+// currently maps to. Returns 0 on success, or -1 (leaving |*fb| untouched)
+// when |index| is outside 0..REF_FRAMES-1.
+int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (index >= 0 && index < REF_FRAMES) {
+    *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
+    return 0;
+  }
+
+  return -1;
+}
+
+// Overwrites the reference frame selected by |ref_frame_flag| with the
+// contents of |sd|. Returns 0 on success, or -1 when the flag does not name
+// a reference.
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  YV12_BUFFER_CONFIG *ref_buf = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+
+  if (!ref_buf) {
+    return -1;
+  }
+
+  vp8_yv12_copy_frame(sd, ref_buf);
+  return 0;
+}
+
+// Records an external request to enable (non-zero) or disable (zero) the
+// frame-context refresh and marks the request as pending. Always returns 0.
+int vp9_update_entropy(VP9_COMP * cpi, int update) {
+  cpi->ext_refresh_frame_context_pending = 1;
+  cpi->ext_refresh_frame_context = update;
+
+  return 0;
+}
+
+
+#ifdef OUTPUT_YUV_SRC
+// Debug helper: appends the Y, U and V planes of |s| to yuv_file, one row
+// of visible pixels at a time (the stride padding is not written).
+// Nothing is written when yuv_file is NULL, and a plane whose height is
+// zero or negative is skipped (the former do/while form would have written
+// at least one row and then kept counting down past zero).
+void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
+  uint8_t *src;
+  int h;
+
+  // yuv_file comes from an unchecked fopen(); do not write through NULL.
+  if (yuv_file == NULL)
+    return;
+
+  src = s->y_buffer;
+  for (h = s->y_height; h > 0; --h) {
+    fwrite(src, s->y_width, 1,  yuv_file);
+    src += s->y_stride;
+  }
+
+  src = s->u_buffer;
+  for (h = s->uv_height; h > 0; --h) {
+    fwrite(src, s->uv_width, 1,  yuv_file);
+    src += s->uv_stride;
+  }
+
+  src = s->v_buffer;
+  for (h = s->uv_height; h > 0; --h) {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  }
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+// Debug helper: writes the planes of cm->frame_to_show to yuv_rec_file and
+// flushes the stream. The luma plane is written for cm->height rows; the
+// chroma planes (and the alpha plane, in CONFIG_ALPHA builds) use the
+// heights stored in the buffer itself.
+// Nothing is written when yuv_rec_file is NULL, and a plane whose height is
+// zero or negative is skipped (the former do/while form would have written
+// at least one row and then kept counting down past zero).
+void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
+  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+  uint8_t *src;
+  int h;
+
+  // yuv_rec_file comes from an unchecked fopen(); do not write through NULL.
+  if (yuv_rec_file == NULL)
+    return;
+
+  src = s->y_buffer;
+  for (h = cm->height; h > 0; --h) {
+    fwrite(src, s->y_width, 1,  yuv_rec_file);
+    src += s->y_stride;
+  }
+
+  src = s->u_buffer;
+  for (h = s->uv_height; h > 0; --h) {
+    fwrite(src, s->uv_width, 1,  yuv_rec_file);
+    src += s->uv_stride;
+  }
+
+  src = s->v_buffer;
+  for (h = s->uv_height; h > 0; --h) {
+    fwrite(src, s->uv_width, 1, yuv_rec_file);
+    src += s->uv_stride;
+  }
+
+#if CONFIG_ALPHA
+  if (s->alpha_buffer) {
+    src = s->alpha_buffer;
+    for (h = s->alpha_height; h > 0; --h) {
+      fwrite(src, s->alpha_width, 1,  yuv_rec_file);
+      src += s->alpha_stride;
+    }
+  }
+#endif
+
+  fflush(yuv_rec_file);
+}
+#endif
+
+// Resizes the first MAX_MB_PLANE planes of |src| into |dst| with
+// vp9_resize_plane(), using each buffer's cropped dimensions, and then
+// extends the borders of |dst|.
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst) {
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
+  // Per-plane tables, indexed as Y, U, V, alpha.
+  const uint8_t *const in_planes[4] = {src->y_buffer, src->u_buffer,
+                                       src->v_buffer, src->alpha_buffer};
+  const int in_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                             src->alpha_stride};
+  const int in_widths[4] = {src->y_crop_width, src->uv_crop_width,
+                            src->uv_crop_width, src->y_crop_width};
+  const int in_heights[4] = {src->y_crop_height, src->uv_crop_height,
+                             src->uv_crop_height, src->y_crop_height};
+  uint8_t *const out_planes[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                                  dst->alpha_buffer};
+  const int out_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
+  const int out_widths[4] = {dst->y_crop_width, dst->uv_crop_width,
+                             dst->uv_crop_width, dst->y_crop_width};
+  const int out_heights[4] = {dst->y_crop_height, dst->uv_crop_height,
+                              dst->uv_crop_height, dst->y_crop_height};
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    vp9_resize_plane(in_planes[plane], in_heights[plane], in_widths[plane],
+                     in_strides[plane],
+                     out_planes[plane], out_heights[plane], out_widths[plane],
+                     out_strides[plane]);
+  }
+
+  // TODO(hkuang): Call C version explicitly
+  // as neon version only expand border size 32.
+  vp8_yv12_extend_frame_borders_c(dst);
+}
+
+// Scales |src| into |dst| with vp9_convolve8(), then extends the borders of
+// |dst|. The destination is walked in steps of 16 luma pixels in both
+// directions; for every step each of the first MAX_MB_PLANE planes is
+// filtered, using the ratio of the cropped luma sizes as the scale factor.
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
+  int x, y, i;
+
+  for (y = 0; y < dst_h; y += 16) {
+    for (x = 0; x < dst_w; x += 16) {
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        // Planes 1 and 2 are processed at half the luma coordinates
+        // (assumes chroma is subsampled by 2 in both directions - TODO
+        // confirm); planes 0 and 3 use the luma coordinates directly.
+        const int factor = (i == 0 || i == 3 ? 1 : 2);
+        // Source position in 1/16-pel units; only the low 4 bits are used
+        // below, to select the sub-pixel filter.
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const int y_q4 = y * (16 / factor) * src_h / dst_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        // Integer source position matching this destination block. The
+        // multiply-then-divide order determines the truncation, so keep it.
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                     src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+        // The step arguments (16 * src / dst) are the scale ratio in
+        // 1/16-pel units; the block is 16 / factor pixels on each side.
+        vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * src_w / dst_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * src_h / dst_h,
+                      16 / factor, 16 / factor);
+      }
+    }
+  }
+
+  // TODO(hkuang): Call C version explicitly
+  // as neon version only expand border size 32.
+  vp8_yv12_extend_frame_borders_c(dst);
+}
+
+// Scans the quantizer indices upward from 0 and returns the first one for
+// which vp9_convert_qindex_to_q() reports a value of at least 30.0. If no
+// index qualifies, the last valid index (QINDEX_RANGE - 1) is returned.
+static int find_fp_qindex(void) {
+  int i;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (vp9_convert_qindex_to_q(i) >= 30.0)
+      return i;
+  }
+
+  return QINDEX_RANGE - 1;
+}
+
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+// Writes |height| rows of |width| bytes from |buf| (row pitch |stride|) to
+// |filename|, opened in "wb" mode. Does nothing if the file cannot be
+// opened.
+static void write_plane_to_file(const char *filename, const uint8_t *buf,
+                                int stride, int width, int height) {
+  FILE *const file = fopen(filename, "wb");
+  int row;
+
+  if (file == NULL)
+    return;
+
+  for (row = 0; row < height; row++)
+    fwrite(buf + row * stride, width, 1, file);
+
+  fclose(file);
+}
+
+// Debug helper: dumps the y, u and v planes of |frame| to the files
+// cx\yNNNN.raw, cx\uNNNN.raw and cx\vNNNN.raw, where NNNN is |this_frame|.
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+  char filename[255];
+
+  snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
+  write_plane_to_file(filename, frame->y_buffer, frame->y_stride,
+                      frame->y_width, frame->y_height);
+
+  snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
+  write_plane_to_file(filename, frame->u_buffer, frame->uv_stride,
+                      frame->uv_width, frame->uv_height);
+
+  snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
+  write_plane_to_file(filename, frame->v_buffer, frame->uv_stride,
+                      frame->uv_width, frame->uv_height);
+}
+#endif
+
+// Decides whether the frame that was just coded should be coded again with
+// a different q. Returns 1 to request a recode and 0 otherwise.
+static int recode_loop_test(const VP9_COMP *cpi,
+                            int high_limit, int low_limit,
+                            int q, int maxq, int minq) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  int recode_allowed;
+
+  // Exceeding the maximum allowed frame size always forces a recode.
+  if (rc->projected_frame_size > rc->max_frame_bandwidth)
+    return 1;
+
+  // Otherwise recoding needs ALLOW_RECODE, or ALLOW_RECODE_KFARFGF on a key
+  // frame, golden frame or alt ref frame.
+  recode_allowed = (cpi->sf.recode_loop == ALLOW_RECODE);
+  if (!recode_allowed && cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) {
+    recode_allowed = (cm->frame_type == KEY_FRAME ||
+                      cpi->refresh_golden_frame ||
+                      cpi->refresh_alt_ref_frame);
+  }
+  if (!recode_allowed)
+    return 0;
+
+  // General overshoot test.
+  if (rc->projected_frame_size > high_limit && q < maxq)
+    return 1;
+
+  // General undershoot test.
+  if (rc->projected_frame_size < low_limit && q > minq)
+    return 1;
+
+  // Constrained quality: recode on undershoot below 7/8 of the target while
+  // q is still above the configured cq level.
+  if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY &&
+      q > oxcf->cq_level &&
+      rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3))
+    return 1;
+
+  return 0;
+}
+/* Updates cm->ref_frame_map once a frame has been encoded, so that the
+ * reference slots selected by the refresh flags refer to the new frame
+ * buffer (cm->new_fb_idx).
+ *
+ * - Key frames: the golden and alt-ref slots are both updated.
+ * - Golden refresh without alt-ref refresh (and, depending on the build,
+ *   with multi-ARF disabled or SVC not in use): the new frame is stored in
+ *   the alt-ref slot and cpi->alt_fb_idx and cpi->gld_fb_idx are swapped.
+ * - Otherwise: the alt-ref and golden slots are updated individually as
+ *   their refresh flags dictate.
+ *
+ * The last-frame slot is updated whenever cpi->refresh_last_frame is set.
+ */
+void vp9_update_reference_frames(VP9_COMP *cpi) {
+  VP9_COMMON * const cm = &cpi->common;
+
+  // At this point the new frame has been encoded.
+  // If any buffer copy / swapping is signaled it should be done here.
+  if (cm->frame_type == KEY_FRAME) {
+    ref_cnt_fb(cm->frame_bufs,
+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+    ref_cnt_fb(cm->frame_bufs,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+  }
+#if CONFIG_MULTIPLE_ARF
+  else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
+      !cpi->refresh_alt_ref_frame) {
+#else
+  else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
+           !cpi->use_svc) {
+#endif
+    /* Preserve the previously existing golden frame and update the frame in
+     * the alt ref slot instead. This is highly specific to the current use of
+     * alt-ref as a forward reference, and this needs to be generalized as
+     * other uses are implemented (like RTC/temporal scaling)
+     *
+     * The update to the buffer in the alt ref slot was signaled in
+     * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated
+     * as the golden frame next time.
+     */
+    int tmp;
+
+    ref_cnt_fb(cm->frame_bufs,
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+    tmp = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = cpi->gld_fb_idx;
+    cpi->gld_fb_idx = tmp;
+  }  else { /* For non key/golden frames */
+    if (cpi->refresh_alt_ref_frame) {
+      int arf_idx = cpi->alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+      if (cpi->multi_arf_enabled) {
+        arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1];
+      }
+#endif
+      ref_cnt_fb(cm->frame_bufs,
+                 &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+    }
+
+    if (cpi->refresh_golden_frame) {
+      ref_cnt_fb(cm->frame_bufs,
+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+    }
+  }
+
+  if (cpi->refresh_last_frame) {
+    ref_cnt_fb(cm->frame_bufs,
+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+  }
+}
+
+// Chooses the loop filter level for the current frame, runs the loop filter
+// on cm->frame_to_show when that level is non-zero, and finally extends the
+// frame's inner borders. Lossless frames get filter level 0. The time spent
+// picking the level is added to cpi->time_pick_lpf.
+static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  struct loopfilter *const lf = &cm->lf;
+
+  if (!xd->lossless) {
+    struct vpx_usec_timer pick_timer;
+
+    vp9_clear_system_state();
+
+    vpx_usec_timer_start(&pick_timer);
+    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+    vpx_usec_timer_mark(&pick_timer);
+
+    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&pick_timer);
+  } else {
+    lf->filter_level = 0;
+  }
+
+  if (lf->filter_level > 0)
+    vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+
+  vp9_extend_frame_inner_borders(cm->frame_to_show);
+}
+
+// For each reference frame from LAST_FRAME to ALTREF_FRAME, records in
+// cpi->scaled_ref_idx[] a frame buffer whose cropped luma size matches
+// cm->width x cm->height. A reference that already matches is reused and its
+// ref_count is incremented; otherwise a free buffer is reallocated at the
+// current size and filled with a scaled copy of the reference.
+void vp9_scale_references(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int slot = ref_frame - 1;
+    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
+    const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
+    const int size_matches = ref->y_crop_width == cm->width &&
+                             ref->y_crop_height == cm->height;
+
+    if (size_matches) {
+      // No scaling needed: share the existing buffer.
+      cm->frame_bufs[idx].ref_count++;
+      cpi->scaled_ref_idx[slot] = idx;
+    } else {
+      const int new_fb = get_free_fb(cm);
+      YV12_BUFFER_CONFIG *const scaled = &cm->frame_bufs[new_fb].buf;
+
+      vp9_realloc_frame_buffer(scaled,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+      scale_and_extend_frame(ref, scaled);
+      cpi->scaled_ref_idx[slot] = new_fb;
+    }
+  }
+}
+
+// Drops one reference from each of the three frame buffers recorded in
+// cpi->scaled_ref_idx[].
+static void release_scaled_references(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int slot;
+
+  for (slot = 0; slot < 3; ++slot) {
+    --cm->frame_bufs[cpi->scaled_ref_idx[slot]].ref_count;
+  }
+}
+
+// Collapses a full token histogram into the model histogram: ZERO_TOKEN and
+// ONE_TOKEN are copied, every token from TWO_TOKEN up to (but excluding)
+// EOB_TOKEN is summed into the TWO_TOKEN bin, and the EOB_TOKEN count is
+// stored in the EOB_MODEL_TOKEN bin.
+static void full_to_model_count(unsigned int *model_count,
+                                unsigned int *full_count) {
+  unsigned int two_or_more = full_count[TWO_TOKEN];
+  int token;
+
+  for (token = THREE_TOKEN; token < EOB_TOKEN; ++token) {
+    two_or_more += full_count[token];
+  }
+
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = two_or_more;
+  model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
+}
+
+// Applies full_to_model_count() to every (plane type, reference type, band,
+// context) entry of |full_count|, writing the results into |model_count|.
+static void full_to_model_counts(vp9_coeff_count_model *model_count,
+                                 vp9_coeff_count *full_count) {
+  int plane, ref, band, ctx;
+
+  for (plane = 0; plane < PLANE_TYPES; ++plane) {
+    for (ref = 0; ref < REF_TYPES; ++ref) {
+      for (band = 0; band < COEF_BANDS; ++band) {
+        for (ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+          full_to_model_count(model_count[plane][ref][band][ctx],
+                              full_count[plane][ref][band][ctx]);
+        }
+      }
+    }
+  }
+}
+
+#if 0 && CONFIG_INTERNAL_STATS
+// Debug helper (compiled out): appends one line of per-frame rate control
+// statistics to "tmp.stt". The file is truncated when
+// cm->current_video_frame is 0 and appended to otherwise. Nothing is
+// written if the file cannot be opened.
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+  int recon_err;
+
+  // Without this check a failed fopen() would pass NULL to fprintf() and
+  // fclose() below.
+  if (f == NULL)
+    return;
+
+  vp9_clear_system_state();
+
+  recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+
+  if (cpi->twopass.total_left_stats.coded_error != 0.0)
+    fprintf(f, "%10u %10d %10d %10d %10d"
+        "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d "
+        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
+        "%6d %6d %5d %5d %5d "
+        "%10"PRId64" %10.3lf"
+        "%10lf %8u %10d %10d %10d\n",
+        cpi->common.current_video_frame, cpi->rc.this_frame_target,
+        cpi->rc.projected_frame_size,
+        cpi->rc.projected_frame_size / cpi->common.MBs,
+        (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+        cpi->rc.vbr_bits_off_target,
+        cpi->rc.total_target_vs_actual,
+        (cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
+        cpi->rc.total_actual_bits, cm->base_qindex,
+        vp9_convert_qindex_to_q(cm->base_qindex),
+        (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+        cpi->rc.avg_q,
+        vp9_convert_qindex_to_q(cpi->rc.ni_av_qi),
+        vp9_convert_qindex_to_q(cpi->oxcf.cq_level),
+        cpi->refresh_last_frame, cpi->refresh_golden_frame,
+        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+        cpi->twopass.bits_left,
+        cpi->twopass.total_left_stats.coded_error,
+        cpi->twopass.bits_left /
+            (1 + cpi->twopass.total_left_stats.coded_error),
+        cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+        cpi->twopass.kf_zeromotion_pct);
+
+  fclose(f);
+
+  if (0) {
+    FILE *const fmodes = fopen("Modes.stt", "a");
+    int i;
+
+    // Skip the mode dump when the file cannot be opened.
+    if (fmodes != NULL) {
+      fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+              cm->frame_type, cpi->refresh_golden_frame,
+              cpi->refresh_alt_ref_frame);
+
+      for (i = 0; i < MAX_MODES; ++i)
+        fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+      fprintf(fmodes, "\n");
+
+      fclose(fmodes);
+    }
+  }
+}
+#endif
+
+// Encodes the frame exactly once at quantizer |q|: sets the quantizer, runs
+// the per-frame setup, performs the setup for the configured AQ mode (if
+// any) and then calls vp9_encode_frame().
+static void encode_without_recode_loop(VP9_COMP *cpi,
+                                       int q) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  vp9_clear_system_state();
+  vp9_set_quantizer(cm, q);
+  setup_frame(cpi);
+
+  // Variance adaptive and in frame q adjustment experiments are mutually
+  // exclusive.
+  switch (cpi->oxcf.aq_mode) {
+    case VARIANCE_AQ:
+      vp9_vaq_frame_setup(cpi);
+      break;
+    case COMPLEXITY_AQ:
+      vp9_setup_in_frame_q_adj(cpi);
+      break;
+    case CYCLIC_REFRESH_AQ:
+      vp9_cyclic_refresh_setup(cpi);
+      break;
+    default:
+      break;
+  }
+
+  // transform / motion compensation build reconstruction frame
+  vp9_encode_frame(cpi);
+
+  // Update the skip mb flag probabilities based on the distribution
+  // seen in the last encoder iteration.
+  // update_base_skip_probs(cpi);
+  vp9_clear_system_state();
+}
+/* Encodes the frame, then re-encodes it with an adjusted q for as long as
+ * the recode tests request it and the adjustment actually changes q.
+ *
+ * size, dest:  handed to vp9_pack_bitstream() for the dummy pack whose
+ *              output size becomes rc->projected_frame_size (in bits).
+ * q:           value passed to vp9_set_quantizer() on the first iteration.
+ * bottom_index, top_index:  starting values of q_low and q_high; also used
+ *              as bounds in the vp9_rc_regulate_q() calls.
+ *
+ * NOTE(review): unlike encode_without_recode_loop(), this loop has no
+ * CYCLIC_REFRESH_AQ setup call - confirm that this is intended.
+ */
+static void encode_with_recode_loop(VP9_COMP *cpi,
+                                    size_t *size,
+                                    uint8_t *dest,
+                                    int q,
+                                    int bottom_index,
+                                    int top_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int loop_count = 0;
+  int loop = 0;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+  int q_low = bottom_index, q_high = top_index;
+  int frame_over_shoot_limit;
+  int frame_under_shoot_limit;
+
+  // Decide frame size bounds
+  vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+                                   &frame_under_shoot_limit,
+                                   &frame_over_shoot_limit);
+
+  do {
+    vp9_clear_system_state();
+
+    vp9_set_quantizer(cm, q);
+
+    if (loop_count == 0)
+      setup_frame(cpi);
+
+    // Variance adaptive and in frame q adjustment experiments are mutually
+    // exclusive.
+    if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+      vp9_vaq_frame_setup(cpi);
+    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      vp9_setup_in_frame_q_adj(cpi);
+    }
+
+    // transform / motion compensation build reconstruction frame
+    vp9_encode_frame(cpi);
+
+    // Update the skip mb flag probabilities based on the distribution
+    // seen in the last encoder iteration.
+    // update_base_skip_probs(cpi);
+
+    vp9_clear_system_state();
+
+    // Dummy pack of the bitstream using up to date stats to get an
+    // accurate estimate of output frame size to determine if we need
+    // to recode.
+    if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+      save_coding_context(cpi);
+      cpi->dummy_packing = 1;
+      if (!cpi->sf.use_nonrd_pick_mode)
+        vp9_pack_bitstream(cpi, dest, size);
+
+      rc->projected_frame_size = (int)(*size) << 3;
+      restore_coding_context(cpi);
+
+      if (frame_over_shoot_limit == 0)
+        frame_over_shoot_limit = 1;
+    }
+
+    if (cpi->oxcf.rc_mode == RC_MODE_CONSTANT_QUALITY) {
+      loop = 0;
+    } else {
+      if ((cm->frame_type == KEY_FRAME) &&
+           rc->this_key_frame_forced &&
+           (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+        int last_q = q;
+        int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+
+        int high_err_target = cpi->ambient_err;
+        int low_err_target = cpi->ambient_err >> 1;
+
+        // Prevent possible divide by zero error below for perfect KF
+        kf_err += !kf_err;
+
+        // The key frame is not good enough or we can afford
+        // to make it better without undue risk of popping.
+        if ((kf_err > high_err_target &&
+             rc->projected_frame_size <= frame_over_shoot_limit) ||
+            (kf_err > low_err_target &&
+             rc->projected_frame_size <= frame_under_shoot_limit)) {
+          // Lower q_high
+          q_high = q > q_low ? q - 1 : q_low;
+
+          // Adjust Q
+          q = (q * high_err_target) / kf_err;
+          q = MIN(q, (q_high + q_low) >> 1);
+        } else if (kf_err < low_err_target &&
+                   rc->projected_frame_size >= frame_under_shoot_limit) {
+          // The key frame is much better than the previous frame
+          // Raise q_low
+          q_low = q < q_high ? q + 1 : q_high;
+
+          // Adjust Q
+          q = (q * low_err_target) / kf_err;
+          q = MIN(q, (q_high + q_low + 1) >> 1);
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        loop = q != last_q;
+      } else if (recode_loop_test(
+          cpi, frame_over_shoot_limit, frame_under_shoot_limit,
+          q, MAX(q_high, top_index), bottom_index)) {
+        // Is the projected frame size out of range and are we allowed
+        // to attempt to recode.
+        int last_q = q;
+        int retries = 0;
+
+        // Frame size out of permitted range:
+        // Update correction factor & compute new Q to try...
+
+        // Frame is too large
+        if (rc->projected_frame_size > rc->this_frame_target) {
+          // Special case if the projected size is > the max allowed.
+          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+            q_high = rc->worst_quality;
+
+          // Raise Qlow as to at least the current value
+          q_low = q < q_high ? q + 1 : q_high;
+
+          if (undershoot_seen || loop_count > 1) {
+            // Update the rate correction factors, then bisect [q_low, q_high].
+            vp9_rc_update_rate_correction_factors(cpi, 1);
+
+            q = (q_high + q_low + 1) / 2;
+          } else {
+            // Update the rate correction factors, then let rate control pick q.
+            vp9_rc_update_rate_correction_factors(cpi, 0);
+
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                   bottom_index, MAX(q_high, top_index));
+
+            while (q < q_low && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi, 0);
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                     bottom_index, MAX(q_high, top_index));
+              retries++;
+            }
+          }
+
+          overshoot_seen = 1;
+        } else {
+          // Frame is too small
+          q_high = q > q_low ? q - 1 : q_low;
+
+          if (overshoot_seen || loop_count > 1) {
+            vp9_rc_update_rate_correction_factors(cpi, 1);
+            q = (q_high + q_low) / 2;
+          } else {
+            vp9_rc_update_rate_correction_factors(cpi, 0);
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                   bottom_index, top_index);
+            // Special case reset for qlow for constrained quality.
+            // This should only trigger where there is very substantial
+            // undershoot on a frame and the auto cq level is above
+            // the user passed in value.
+            if (cpi->oxcf.rc_mode == RC_MODE_CONSTRAINED_QUALITY &&
+                q < q_low) {
+              q_low = q;
+            }
+
+            while (q > q_high && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi, 0);
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                     bottom_index, top_index);
+              retries++;
+            }
+          }
+
+          undershoot_seen = 1;
+        }
+
+        // Clamp Q to upper and lower limits:
+        q = clamp(q, q_low, q_high);
+
+        loop = q != last_q;
+      } else {
+        loop = 0;
+      }
+    }
+
+    // Special case for overlay frame.
+    if (rc->is_src_frame_alt_ref &&
+        rc->projected_frame_size < rc->max_frame_bandwidth)
+      loop = 0;
+
+    if (loop) {
+      loop_count++;
+
+#if CONFIG_INTERNAL_STATS
+      cpi->tot_recode_hits++;
+#endif
+    }
+  } while (loop);
+}
+
+// Refreshes the gold_is_last / alt_is_last / gold_is_alt trackers from the
+// current refresh flags and then rebuilds cpi->ref_frame_flags. Each tracker
+// is set when both of its frames are refreshed together, cleared when only
+// one of them is refreshed, and left untouched when neither is.
+static void get_ref_frame_flags(VP9_COMP *cpi) {
+  if (cpi->refresh_last_frame & cpi->refresh_golden_frame) {
+    cpi->gold_is_last = 1;
+  } else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame) {
+    cpi->gold_is_last = 0;
+  }
+
+  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame) {
+    cpi->alt_is_last = 1;
+  } else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame) {
+    cpi->alt_is_last = 0;
+  }
+
+  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame) {
+    cpi->gold_is_alt = 1;
+  } else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame) {
+    cpi->gold_is_alt = 0;
+  }
+
+  // Start with all three references enabled, then remove the redundant ones.
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  // Golden is dropped when it duplicates last, or when
+  // frames_till_gf_update_due is INT_MAX.
+  if (cpi->gold_is_last || cpi->rc.frames_till_gf_update_due == INT_MAX) {
+    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+  }
+
+  // Alt-ref is dropped when it duplicates either last or golden.
+  if (cpi->alt_is_last || cpi->gold_is_alt) {
+    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+  }
+}
+
+// Applies any pending externally supplied overrides (set through the
+// vp9_update_reference() and vp9_update_entropy() calls) and clears their
+// pending markers, so that each override only affects the next frame passed
+// to encode_frame_to_data_rate().
+static void set_ext_overrides(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cpi->ext_refresh_frame_context_pending) {
+    cpi->ext_refresh_frame_context_pending = 0;
+    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
+  }
+
+  if (cpi->ext_refresh_frame_flags_pending) {
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+    cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+    cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+  }
+}
+
+// Returns |unscaled| when its luma dimensions already equal the frame size
+// implied by cm->mi_cols / cm->mi_rows (in units of MI_SIZE). Otherwise
+// scales |unscaled| into |scaled| and returns |scaled|.
+YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled) {
+  const int size_matches = cm->mi_cols * MI_SIZE == unscaled->y_width &&
+                           cm->mi_rows * MI_SIZE == unscaled->y_height;
+
+  if (size_matches)
+    return unscaled;
+
+  scale_and_extend_frame_nonnormative(unscaled, scaled);
+  return scaled;
+}
+
+// Encodes the current source frame and packs the bitstream into |dest|,
+// with the packed size reported through |*size| and the resulting frame
+// flags through |*frame_flags|. Afterwards the reference buffers, entropy
+// counts, rate control state and frame counters are updated.
+//
+// In one-pass CBR mode a non-key frame may be dropped. In that case the
+// function returns early without writing |*size| or |*frame_flags|.
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+                                      size_t *size,
+                                      uint8_t *dest,
+                                      unsigned int *frame_flags) {
+  VP9_COMMON *const cm = &cpi->common;
+  TX_SIZE t;
+  int q;
+  int top_index;
+  int bottom_index;
+
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+  struct segmentation *const seg = &cm->seg;
+  set_ext_overrides(cpi);
+
+  cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                      &cpi->scaled_source);
+
+  if (cpi->unscaled_last_source != NULL)
+    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                             &cpi->scaled_last_source);
+
+  // Takes a reference on one frame buffer per reference frame; these are
+  // given back by release_scaled_references().
+  vp9_scale_references(cpi);
+
+  vp9_clear_system_state();
+
+  // Enable or disable mode based tweaking of the zbin.
+  // For 2 pass only used where GF/ARF prediction quality
+  // is above a threshold.
+  cpi->zbin_mode_boost = 0;
+  cpi->zbin_mode_boost_enabled = 0;
+
+  // Current default encoder behavior for the altref sign bias.
+  cm->ref_frame_sign_bias[ALTREF_FRAME] = cpi->rc.source_alt_ref_active;
+
+  // Set default state for segment based loop filter update flags.
+  cm->lf.mode_ref_delta_update = 0;
+
+  // Initialize cpi->mv_step_param to default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(sf, max_mv_def);
+  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+  if (sf->auto_mv_step_size) {
+    if (frame_is_intra_only(cm)) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame)
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param = vp9_init_search_range(sf, MIN(max_mv_def, 2 *
+                                 cpi->max_mv_magnitude));
+      cpi->max_mv_magnitude = 0;
+    }
+  }
+
+  // Set various flags etc to special state if it is a key frame.
+  if (frame_is_intra_only(cm)) {
+    // Reset the loop filter deltas and segmentation map.
+    vp9_reset_segment_features(&cm->seg);
+
+    // If segmentation is enabled force a map update for key frames.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+    }
+
+    // The alternate reference frame cannot be active for a key frame.
+    cpi->rc.source_alt_ref_active = 0;
+
+    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
+    cm->frame_parallel_decoding_mode =
+      (cpi->oxcf.frame_parallel_decoding_mode != 0);
+
+    // By default, encoder assumes decoder can use prev_mi.
+    cm->coding_use_prev_mi = 1;
+    if (cm->error_resilient_mode) {
+      cm->coding_use_prev_mi = 0;
+      cm->frame_parallel_decoding_mode = 1;
+      cm->reset_frame_context = 0;
+      cm->refresh_frame_context = 0;
+    } else if (cm->intra_only) {
+      // Only reset the current context.
+      cm->reset_frame_context = 2;
+    }
+  }
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in second pass of two pass (as requires lagged coding)
+  // and if the relevant speed feature flag is set.
+  if (cpi->pass == 2 && cpi->sf.static_segmentation)
+    configure_static_seg_features(cpi);
+
+  // For 1 pass CBR, check if we are dropping this frame.
+  // Never drop on key frame.
+  if (cpi->pass == 0 &&
+      cpi->oxcf.rc_mode == RC_MODE_CBR &&
+      cm->frame_type != KEY_FRAME) {
+    if (vp9_rc_drop_frame(cpi)) {
+      // Give back the references taken by vp9_scale_references() above;
+      // the regular release point further down is never reached on this
+      // path, so the buffers would otherwise stay referenced.
+      release_scaled_references(cpi);
+      vp9_rc_postencode_update_drop_frame(cpi);
+      ++cm->current_video_frame;
+      return;
+    }
+  }
+
+  vp9_clear_system_state();
+
+  vp9_zero(cpi->rd.tx_select_threshes);
+
+#if CONFIG_VP9_POSTPROC
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    // Map the configured noise sensitivity to the level given to
+    // vp9_denoise(); values outside 1..6 leave it at 0.
+    int l = 0;
+    switch (cpi->oxcf.noise_sensitivity) {
+      case 1:
+        l = 20;
+        break;
+      case 2:
+        l = 40;
+        break;
+      case 3:
+        l = 60;
+        break;
+      case 4:
+      case 5:
+        l = 100;
+        break;
+      case 6:
+        l = 150;
+        break;
+    }
+    vp9_denoise(cpi->Source, cpi->Source, l);
+  }
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+  vp9_write_yuv_frame(cpi->Source);
+#endif
+
+  set_speed_features(cpi);
+
+  // Decide q and q bounds.
+  q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
+
+  if (!frame_is_intra_only(cm)) {
+    cm->interp_filter = DEFAULT_INTERP_FILTER;
+    /* TODO: Decide this more intelligently */
+    set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
+  }
+
+  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+    encode_without_recode_loop(cpi, q);
+  } else {
+    encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
+  }
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if it is the frame before
+  // the force key frame
+  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+    cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  }
+
+  // If the encoder forced a KEY_FRAME decision
+  if (cm->frame_type == KEY_FRAME)
+    cpi->refresh_last_frame = 1;
+
+  cm->frame_to_show = get_frame_new_buffer(cm);
+
+#if WRITE_RECON_BUFFER
+  if (cm->show_frame)
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame);
+  else
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 1000);
+#endif
+
+  // Pick the loop filter level for the frame.
+  loopfilter_frame(cpi, cm);
+
+#if WRITE_RECON_BUFFER
+  if (cm->show_frame)
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 2000);
+  else
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 3000);
+#endif
+
+  // build the bitstream
+  cpi->dummy_packing = 0;
+  vp9_pack_bitstream(cpi, dest, size);
+
+  if (cm->seg.update_map)
+    update_reference_segmentation_map(cpi);
+
+  release_scaled_references(cpi);
+  vp9_update_reference_frames(cpi);
+
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
+
+  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
+    vp9_adapt_coef_probs(cm);
+
+  if (!frame_is_intra_only(cm)) {
+    if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+      vp9_adapt_mode_probs(cm);
+      vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+    }
+  }
+
+  if (cpi->refresh_golden_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+  if (cpi->refresh_alt_ref_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+  get_ref_frame_flags(cpi);
+
+  cm->last_frame_type = cm->frame_type;
+  vp9_rc_postencode_update(cpi, *size);
+
+#if 0
+  output_frame_level_debug_stats(cpi);
+#endif
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Tell the caller that the frame was coded as a key frame
+    *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+
+#if CONFIG_MULTIPLE_ARF
+    // Reset the sequence number.
+    if (cpi->multi_arf_enabled) {
+      cpi->sequence_number = 0;
+      cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
+      cpi->new_frame_coding_order_period = -1;
+    }
+#endif
+  } else {
+    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+#if CONFIG_MULTIPLE_ARF
+    /* Increment position in the coded frame sequence. */
+    if (cpi->multi_arf_enabled) {
+      ++cpi->sequence_number;
+      if (cpi->sequence_number >= cpi->frame_coding_order_period) {
+        cpi->sequence_number = 0;
+        cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
+        cpi->new_frame_coding_order_period = -1;
+      }
+      cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
+      assert(cpi->this_frame_weight >= 0);
+    }
+#endif
+  }
+
+  // Clear the one shot update flags for segmentation map and mode/ref loop
+  // filter deltas.
+  cm->seg.update_map = 0;
+  cm->seg.update_data = 0;
+  cm->lf.mode_ref_delta_update = 0;
+
+  // keep track of the last coded dimensions
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+
+  // reset to normal state now that we are done.
+  if (!cm->show_existing_frame)
+    cm->last_show_frame = cm->show_frame;
+
+  if (cm->show_frame) {
+    vp9_swap_mi_and_prev_mi(cm);
+
+    // Don't increment frame counters if this was an altref buffer
+    // update not a real frame
+    ++cm->current_video_frame;
+    if (cpi->use_svc)
+      vp9_inc_frame_in_layer(&cpi->svc);
+  }
+}
+/* Obtains the rate control parameters through vp9_rc_get_svc_params() and
+ * then encodes the frame with encode_frame_to_data_rate(), forwarding
+ * |size|, |dest| and |frame_flags| unchanged.
+ */
+static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+                      unsigned int *frame_flags) {
+  vp9_rc_get_svc_params(cpi);
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+}
+/* Obtains the one-pass rate control parameters (the CBR variant when
+ * cpi->oxcf.rc_mode is RC_MODE_CBR, the VBR variant otherwise) and then
+ * encodes the frame with encode_frame_to_data_rate().
+ */
+static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+                        unsigned int *frame_flags) {
+  if (cpi->oxcf.rc_mode == RC_MODE_CBR) {
+    vp9_rc_get_one_pass_cbr_params(cpi);
+  } else {
+    vp9_rc_get_one_pass_vbr_params(cpi);
+  }
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+}
+/* Runs the first pass on the current frame: obtains the first pass rate
+ * control parameters, sets the quantizer to the index returned by
+ * find_fp_qindex() and calls vp9_first_pass(). |size|, |dest| and
+ * |frame_flags| are not used.
+ */
+static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+                        unsigned int *frame_flags) {
+  (void) size;
+  (void) dest;
+  (void) frame_flags;
+
+  vp9_rc_get_first_pass_params(cpi);
+  vp9_set_quantizer(&cpi->common, find_fp_qindex());
+  vp9_first_pass(cpi);
+}
+/* Second pass encode of the current frame: sets cpi->allow_encode_breakout
+ * to ENCODE_BREAKOUT_ENABLED, obtains the second pass rate control
+ * parameters, encodes the frame with encode_frame_to_data_rate() and then
+ * calls vp9_twopass_postencode_update().
+ */
+static void Pass2Encode(VP9_COMP *cpi, size_t *size,
+                        uint8_t *dest, unsigned int *frame_flags) {
+  cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+  vp9_rc_get_second_pass_params(cpi);
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+
+  vp9_twopass_postencode_update(cpi);
+}
+
+// One-time setup performed while cpi->initial_width is still zero: stores
+// the chroma subsampling in |cm|, allocates the raw frame buffers and
+// records the current frame size as the initial size. Does nothing once
+// cpi->initial_width is non-zero.
+static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
+                                int subsampling_y) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cpi->initial_width)
+    return;
+
+  cm->subsampling_x = subsampling_x;
+  cm->subsampling_y = subsampling_y;
+
+  alloc_raw_frame_buffers(cpi);
+
+  cpi->initial_width = cm->width;
+  cpi->initial_height = cm->height;
+}
+
+
+// Pushes the raw frame |sd| with its timestamps and |frame_flags| onto the
+// lookahead queue, adding the time taken to cpi->time_receive_data.
+// Returns 0 on success and -1 if the push fails or if the frame is not
+// subsampled in both chroma directions while cm->profile is PROFILE_0 (an
+// error is also raised through vpx_internal_error() in that case).
+int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int subsampling_x = sd->uv_width  < sd->y_width;
+  const int subsampling_y = sd->uv_height < sd->y_height;
+  struct vpx_usec_timer timer;
+  int res;
+
+  check_initial_width(cpi, subsampling_x, subsampling_y);
+
+  vpx_usec_timer_start(&timer);
+  res = vp9_lookahead_push(cpi->lookahead,
+                           sd, time_stamp, end_time, frame_flags) ? -1 : 0;
+  vpx_usec_timer_mark(&timer);
+  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+  if (cm->profile == PROFILE_0 &&
+      !(subsampling_x == 1 && subsampling_y == 1)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                       "Non-4:2:0 color space requires profile >= 1");
+    res = -1;
+  }
+
+  return res;
+}
+
+
+// Returns 1 when the frame is a key frame, refreshes any of the three
+// reference buffers, or updates persistent coding state (frame context,
+// loop-filter deltas, segmentation map/data); otherwise 0.
+static int frame_is_reference(const VP9_COMP *cpi) {
+  const VP9_COMMON *cm = &cpi->common;
+  const int key_or_refresh =
+      cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+      cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame;
+  const int updates_state =
+      cm->refresh_frame_context || cm->lf.mode_ref_delta_update ||
+      cm->seg.update_map || cm->seg.update_data;
+
+  return key_or_refresh || updates_state;
+}
+
+#if CONFIG_MULTIPLE_ARF
+// Returns 1 if the entry after the current sequence position is an ARF,
+// otherwise 0.
+int is_next_frame_arf(VP9_COMP *cpi) {
+  const unsigned int next_slot = cpi->sequence_number + 1;
+
+  // Negative entry in frame_coding_order indicates an ARF at this position.
+  return cpi->frame_coding_order[next_slot] < 0;
+}
+#endif
+
+void adjust_frame_rate(VP9_COMP *cpi) {
+  int64_t this_duration;
+  int step = 0;
+  // Frame at the earliest timestamp seen: use its own duration, force a step.
+  if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
+    this_duration = cpi->source->ts_end - cpi->source->ts_start;
+    step = 1;
+  } else {
+    int64_t last_duration = cpi->last_end_time_stamp_seen
+        - cpi->last_time_stamp_seen;
+    // Measured from the previous frame's end, so any timestamp gap counts.
+    this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // do a step update if the duration changes by 10%
+    if (last_duration)
+      step = (int)((this_duration - last_duration) * 10 / last_duration);
+  }
+  // NOTE(review): 10000000.0 / duration as a rate implies 100 ns ticks.
+  if (this_duration) {
+    if (step) {
+      vp9_new_framerate(cpi, 10000000.0 / this_duration);
+    } else {
+      // Average this frame's rate into the last second's average
+      // frame rate. If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval = MIN((double)(cpi->source->ts_end
+                                   - cpi->first_time_stamp_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->oxcf.framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      vp9_new_framerate(cpi, 10000000.0 / avg_duration);
+    }
+  }
+  cpi->last_time_stamp_seen = cpi->source->ts_start;
+  cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+}
+
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  RATE_CONTROL *const rc = &cpi->rc;
+  struct vpx_usec_timer  cmptimer;
+  YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+  MV_REFERENCE_FRAME ref_frame;
+  // Codes one frame from the lookahead; returns -1 if no source frame is found.
+  if (!cpi)
+    return -1;
+  // NOTE(review): the initialisers above already formed addresses from cpi.
+  if (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2) {
+    vp9_restore_layer_context(cpi);
+  }
+
+  vpx_usec_timer_start(&cmptimer);
+
+  cpi->source = NULL;
+  cpi->last_source = NULL;
+
+  set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+
+  // Normal defaults
+  cm->reset_frame_context = 0;
+  cm->refresh_frame_context = 1;
+  cpi->refresh_last_frame = 1;
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_alt_ref_frame = 0;
+
+  // Should we code an alternate reference frame.
+  if (cpi->oxcf.play_alternate && rc->source_alt_ref_pending) {
+    int frames_to_arf;
+
+#if CONFIG_MULTIPLE_ARF
+    assert(!cpi->multi_arf_enabled ||
+           cpi->frame_coding_order[cpi->sequence_number] < 0);
+
+    if (cpi->multi_arf_enabled && (cpi->pass == 2))
+      frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
+          - cpi->next_frame_in_order;
+    else
+#endif
+      frames_to_arf = rc->frames_till_gf_update_due;
+
+    assert(frames_to_arf <= rc->frames_to_key);
+
+    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
+#if CONFIG_MULTIPLE_ARF
+      cpi->alt_ref_source[cpi->arf_buffered] = cpi->source;
+#else
+      cpi->alt_ref_source = cpi->source;
+#endif
+
+      if (cpi->oxcf.arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        // TODO(agrange) merge these two functions.
+        vp9_configure_arnr_filter(cpi, frames_to_arf, rc->gfu_boost);
+        vp9_temporal_filter_prepare(cpi, frames_to_arf);
+        vp9_extend_frame_borders(&cpi->alt_ref_buffer);
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+
+      cm->show_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 0;
+      rc->is_src_frame_alt_ref = 0;
+
+#if CONFIG_MULTIPLE_ARF
+      if (!cpi->multi_arf_enabled)
+#endif
+        rc->source_alt_ref_pending = 0;
+    } else {
+      rc->source_alt_ref_pending = 0;
+    }
+  }
+  // No ARF source was chosen above: pop the next queued frame instead.
+  if (!cpi->source) {
+#if CONFIG_MULTIPLE_ARF
+    int i;
+#endif
+
+    // Get last frame source.
+    if (cm->current_video_frame > 0) {
+      if ((cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
+        return -1;
+    }
+
+    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
+      cm->show_frame = 1;
+      cm->intra_only = 0;
+
+#if CONFIG_MULTIPLE_ARF
+      // Is this frame the ARF overlay.
+      rc->is_src_frame_alt_ref = 0;
+      for (i = 0; i < cpi->arf_buffered; ++i) {
+        if (cpi->source == cpi->alt_ref_source[i]) {
+          rc->is_src_frame_alt_ref = 1;
+          cpi->refresh_golden_frame = 1;
+          break;
+        }
+      }
+#else
+      rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
+                                 (cpi->source == cpi->alt_ref_source);
+#endif
+      if (rc->is_src_frame_alt_ref) {
+        // Current frame is an ARF overlay frame.
+#if CONFIG_MULTIPLE_ARF
+        cpi->alt_ref_source[i] = NULL;
+#else
+        cpi->alt_ref_source = NULL;
+#endif
+        // Don't refresh the last buffer for an ARF overlay frame. It will
+        // become the GF so preserve last as an alternative prediction option.
+        cpi->refresh_last_frame = 0;
+      }
+#if CONFIG_MULTIPLE_ARF
+      ++cpi->next_frame_in_order;
+#endif
+    }
+  }
+  // Publish the chosen source (or the filtered ARF buffer) and its timestamps.
+  if (cpi->source) {
+    cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
+                                                           : &cpi->source->img;
+
+  if (cpi->last_source != NULL) {
+    cpi->unscaled_last_source = &cpi->last_source->img;
+  } else {
+    cpi->unscaled_last_source = NULL;
+  }
+
+    *time_stamp = cpi->source->ts_start;
+    *time_end = cpi->source->ts_end;
+    *frame_flags = cpi->source->flags;
+
+#if CONFIG_MULTIPLE_ARF
+    if (cm->frame_type != KEY_FRAME && cpi->pass == 2)
+      rc->source_alt_ref_pending = is_next_frame_arf(cpi);
+#endif
+  } else {
+    *size = 0;
+    if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
+      vp9_end_first_pass(cpi);    /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+    return -1;
+  }
+  // Track the earliest start timestamp seen so far.
+  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = cpi->source->ts_start;
+    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+  }
+
+  // adjust frame rates based on timestamps given
+  if (cm->show_frame) {
+    adjust_frame_rate(cpi);
+  }
+
+  if (cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.rc_mode == RC_MODE_CBR) {
+    vp9_update_temporal_layer_framerate(cpi);
+    vp9_restore_layer_context(cpi);
+  }
+
+  // start with a 0 size frame
+  *size = 0;
+
+  // Clear down mmx registers
+  vp9_clear_system_state();
+
+  /* find a free buffer for the new frame, releasing the reference previously
+   * held.
+   */
+  cm->frame_bufs[cm->new_fb_idx].ref_count--;
+  cm->new_fb_idx = get_free_fb(cm);
+
+#if CONFIG_MULTIPLE_ARF
+  /* Set up the correct ARF frame. */
+  if (cpi->refresh_alt_ref_frame) {
+    ++cpi->arf_buffered;
+  }
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+      (cpi->pass == 2)) {
+    cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number];
+  }
+#endif
+
+  cpi->frame_flags = *frame_flags;
+
+  if (cpi->pass == 2 &&
+      cm->current_video_frame == 0 &&
+      cpi->oxcf.allow_spatial_resampling &&
+      cpi->oxcf.rc_mode == RC_MODE_VBR) {
+    // Internal scaling is triggered on the first frame.
+    vp9_set_size_literal(cpi, cpi->oxcf.scaled_frame_width,
+                         cpi->oxcf.scaled_frame_height);
+  }
+
+  // Reset the frame pointers to the current frame size
+  vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+  // Bind each reference slot to its buffer and set up scaling against it.
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
+    YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    ref_buf->buf = buf;
+    ref_buf->idx = idx;
+    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                      buf->y_crop_width, buf->y_crop_height,
+                                      cm->width, cm->height);
+
+    if (vp9_is_scaled(&ref_buf->sf))
+      vp9_extend_frame_borders(buf);
+  }
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+
+  if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    vp9_vaq_init();
+  }
+  // Dispatch on the pass number and the SVC configuration.
+  if (cpi->pass == 1 &&
+      (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
+    Pass1Encode(cpi, size, dest, frame_flags);
+  } else if (cpi->pass == 2 &&
+      (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
+    Pass2Encode(cpi, size, dest, frame_flags);
+  } else if (cpi->use_svc) {
+    SvcEncode(cpi, size, dest, frame_flags);
+  } else {
+    // One pass encode
+    Pass0Encode(cpi, size, dest, frame_flags);
+  }
+  // Save the current frame context (cm->fc) into its slot when requested.
+  if (cm->refresh_frame_context)
+    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+
+  // Frame was dropped, release scaled references.
+  if (*size == 0) {
+    release_scaled_references(cpi);
+  }
+
+  if (*size > 0) {
+    cpi->droppable = !frame_is_reference(cpi);
+  }
+
+  // Save layer specific state.
+  if ((cpi->svc.number_temporal_layers > 1 &&
+      cpi->oxcf.rc_mode == RC_MODE_CBR) ||
+      (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) {
+    vp9_save_layer_context(cpi);
+  }
+
+  vpx_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+    generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+
+  if (cpi->pass != 1) {
+    cpi->bytes += (int)(*size);
+
+    if (cm->show_frame) {
+      cpi->count++;
+
+      if (cpi->b_calculate_psnr) {
+        YV12_BUFFER_CONFIG *orig = cpi->Source;
+        YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+        YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+        PSNR_STATS psnr;
+        calc_psnr(orig, recon, &psnr);
+
+        cpi->total += psnr.psnr[0];
+        cpi->total_y += psnr.psnr[1];
+        cpi->total_u += psnr.psnr[2];
+        cpi->total_v += psnr.psnr[3];
+        cpi->total_sq_error += psnr.sse[0];
+        cpi->total_samples += psnr.samples[0];
+
+        {
+          PSNR_STATS psnr2;
+          double frame_ssim2 = 0, weight = 0;
+#if CONFIG_VP9_POSTPROC
+          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
+                      cm->lf.filter_level * 10 / 6);
+#endif
+          vp9_clear_system_state();
+
+          calc_psnr(orig, pp, &psnr2);
+
+          cpi->totalp += psnr2.psnr[0];
+          cpi->totalp_y += psnr2.psnr[1];
+          cpi->totalp_u += psnr2.psnr[2];
+          cpi->totalp_v += psnr2.psnr[3];
+          cpi->totalp_sq_error += psnr2.sse[0];
+          cpi->totalp_samples += psnr2.samples[0];
+
+          frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
+
+          cpi->summed_quality += frame_ssim2 * weight;
+          cpi->summed_weights += weight;
+
+          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
+
+          cpi->summedp_quality += frame_ssim2 * weight;
+          cpi->summedp_weights += weight;
+#if 0
+          {
+            FILE *f = fopen("q_used.stt", "a");
+            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+                    cpi->common.current_video_frame, y2, u2, v2,
+                    frame_psnr2, frame_ssim2);
+            fclose(f);
+          }
+#endif
+        }
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        double y, u, v, frame_all;
+        frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        cpi->total_ssimg_y += y;
+        cpi->total_ssimg_u += u;
+        cpi->total_ssimg_v += v;
+        cpi->total_ssimg_all += frame_all;
+      }
+    }
+  }
+
+#endif
+  return 0;
+}
+
+// Fills *dest with a view of the frame to show. With CONFIG_VP9_POSTPROC the
+// result of vp9_post_proc_frame() is returned; otherwise 0 on success.
+// Returns -1 if the current frame is not shown or, without post-processing,
+// if there is no frame_to_show.
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags) {
+  VP9_COMMON *cm = &cpi->common;
+  int ret = -1;
+#if !CONFIG_VP9_POSTPROC
+  (void)flags;
+#endif
+
+  // Nothing to preview for a frame that is not shown.
+  if (!cm->show_frame)
+    return -1;
+
+#if CONFIG_VP9_POSTPROC
+  ret = vp9_post_proc_frame(cm, dest, flags);
+#else
+  if (cm->frame_to_show) {
+    // Struct copy first, then overwrite the plane sizes with the current
+    // frame dimensions (chroma derived through the subsampling shifts).
+    *dest = *cm->frame_to_show;
+    dest->y_width = cm->width;
+    dest->y_height = cm->height;
+    dest->uv_width = cm->width >> cm->subsampling_x;
+    dest->uv_height = cm->height >> cm->subsampling_y;
+    ret = 0;
+  }
+#endif  // !CONFIG_VP9_POSTPROC
+  vp9_clear_system_state();
+  return ret;
+}
+
+// Installs (map != NULL) or disables (map == NULL) the active map. rows and
+// cols must equal the frame's mb_rows and mb_cols; on a mismatch -1 is
+// returned and the current active-map state is left untouched.
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) {
+  const VP9_COMMON *const cm = &cpi->common;
+
+  if (rows != cm->mb_rows || cols != cm->mb_cols)
+    return -1;
+
+  // One byte per macroblock is copied when a map is supplied.
+  if (map != NULL)
+    vpx_memcpy(cpi->active_map, map, rows * cols);
+
+  cpi->active_map_enabled = (map != NULL);
+  return 0;
+}
+
+// Scales the configured width/height by the ratio Scale2Ratio() reports for
+// each axis mode and updates the frame size. Returns -1 if either mode is
+// above ONETWO, otherwise 0.
+int vp9_set_internal_size(VP9_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
+  VP9_COMMON *cm = &cpi->common;
+  int h_num = 0, h_den = 0, v_num = 0, v_den = 0;
+
+  if (horiz_mode > ONETWO || vert_mode > ONETWO)
+    return -1;
+
+  Scale2Ratio(horiz_mode, &h_num, &h_den);
+  Scale2Ratio(vert_mode, &v_num, &v_den);
+
+  // Ceiling division: dimension * num / den, rounded up to a whole pixel.
+  cm->width = (cpi->oxcf.width * h_num + h_den - 1) / h_den;
+  cm->height = (cpi->oxcf.height * v_num + v_den - 1) / v_den;
+
+  assert(cm->width <= cpi->initial_width);
+  assert(cm->height <= cpi->initial_height);
+  update_frame_size(cpi);
+  return 0;
+}
+
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
+                         unsigned int height) {
+  VP9_COMMON *cm = &cpi->common;
+  // Latch the initial frame size first; the limits below are relative to it.
+  check_initial_width(cpi, 1, 1);
+  // A zero width (or height) leaves that dimension unchanged.
+  if (width) {
+    cm->width = width;
+    if (cm->width * 5 < cpi->initial_width) {
+      cm->width = cpi->initial_width / 5 + 1;
+      printf("Warning: Desired width too small, changed to %d\n", cm->width);
+    }
+    if (cm->width > cpi->initial_width) {
+      cm->width = cpi->initial_width;
+      printf("Warning: Desired width too large, changed to %d\n", cm->width);
+    }
+  }
+  // Same clamping as for the width, against initial_height.
+  if (height) {
+    cm->height = height;
+    if (cm->height * 5 < cpi->initial_height) {
+      cm->height = cpi->initial_height / 5 + 1;
+      printf("Warning: Desired height too small, changed to %d\n", cm->height);
+    }
+    if (cm->height > cpi->initial_height) {
+      cm->height = cpi->initial_height;
+      printf("Warning: Desired height too large, changed to %d\n", cm->height);
+    }
+  }
+
+  assert(cm->width <= cpi->initial_width);
+  assert(cm->height <= cpi->initial_height);
+  update_frame_size(cpi);
+  return 0;
+}
+
+// Stores the caller's use_svc flag on the encoder instance.
+void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
+  cpi->use_svc = use_svc;
+}
+
+int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  // NOTE(review): get_sse's result is narrowed to int; confirm it always fits.
+  return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                      a->y_crop_width, a->y_crop_height);
+}
+
+
+// Accessor for the encoder's current base quantizer index (base_qindex).
+int vp9_get_quantizer(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+
+  return cm->base_qindex;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
new file mode 100644
index 00000000000..17c826f8cf0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -0,0 +1,629 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ENCODER_H_
+#define VP9_ENCODER_VP9_ENCODER_H_
+
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+
+#include "vp9/common/vp9_ppflags.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DEFAULT_GF_INTERVAL         10
+
+#define MAX_MODES 30
+#define MAX_REFS  6
+
+typedef struct {
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+
+  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+
+  unsigned char *last_frame_seg_map_copy;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+  // 0 = ZERO_MV, MV
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+  FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+  THR_NEARESTMV,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_DC,
+
+  THR_NEWMV,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARESTGA,
+
+  THR_TM,
+
+  THR_COMP_NEARLA,
+  THR_COMP_NEWLA,
+  THR_NEARG,
+  THR_COMP_NEARGA,
+  THR_COMP_NEWGA,
+
+  THR_ZEROMV,
+  THR_ZEROG,
+  THR_ZEROA,
+  THR_COMP_ZEROLA,
+  THR_COMP_ZEROGA,
+
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D207_PRED,
+  THR_D153_PRED,
+  THR_D63_PRED,
+  THR_D117_PRED,
+  THR_D45_PRED,
+} THR_MODES;
+
+typedef enum {
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef enum {
+  // encode_breakout is disabled.
+  ENCODE_BREAKOUT_DISABLED = 0,
+  // encode_breakout is enabled.
+  ENCODE_BREAKOUT_ENABLED = 1,
+  // encode_breakout is enabled with small max_thresh limit.
+  ENCODE_BREAKOUT_LIMITED = 2
+} ENCODE_BREAKOUT_TYPE;
+
+typedef enum {
+  NORMAL      = 0,
+  FOURFIVE    = 1,
+  THREEFIVE   = 2,
+  ONETWO      = 3
+} VPX_SCALING;
+
+typedef enum {
+  RC_MODE_VBR = 0,
+  RC_MODE_CBR = 1,
+  RC_MODE_CONSTRAINED_QUALITY = 2,
+  RC_MODE_CONSTANT_QUALITY    = 3,
+} RC_MODE;
+
+typedef enum {
+  // Good Quality Fast Encoding. The encoder balances quality with the
+  // amount of time it takes to encode the output. (speed setting
+  // controls how fast)
+  ONE_PASS_GOOD = 1,
+
+  // One Pass - Best Quality. The encoder places priority on the
+  // quality of the output over encoding speed. The output is compressed
+  // at the highest possible quality. This option takes the longest
+  // amount of time to encode. (speed setting ignored)
+  ONE_PASS_BEST = 2,
+
+  // Two Pass - First Pass. The encoder generates a file of statistics
+  // for use in the second encoding pass. (speed setting controls how fast)
+  TWO_PASS_FIRST = 3,
+
+  // Two Pass - Second Pass. The encoder uses the statistics that were
+  // generated in the first encoding pass to create the compressed
+  // output. (speed setting controls how fast)
+  TWO_PASS_SECOND_GOOD = 4,
+
+  // Two Pass - Second Pass Best.  The encoder uses the statistics that
+  // were generated in the first encoding pass to create the compressed
+  // output using the highest possible quality, and taking a
+  // longer amount of time to encode. (speed setting ignored)
+  TWO_PASS_SECOND_BEST = 5,
+
+  // Realtime/Live Encoding. This mode is optimized for realtime
+  // encoding (for example, capturing a television signal or feed from
+  // a live camera). (speed setting controls how fast)
+  REALTIME = 6,
+} MODE;
+
+typedef enum {
+  FRAMEFLAGS_KEY    = 1 << 0,
+  FRAMEFLAGS_GOLDEN = 1 << 1,
+  FRAMEFLAGS_ALTREF = 1 << 2,
+} FRAMETYPE_FLAGS;
+
+typedef enum {
+  NO_AQ = 0,
+  VARIANCE_AQ = 1,
+  COMPLEXITY_AQ = 2,
+  CYCLIC_REFRESH_AQ = 3,
+  AQ_MODE_COUNT  // This should always be the last member of the enum
+} AQ_MODE;
+
+
+typedef struct VP9EncoderConfig {
+  BITSTREAM_PROFILE profile;
+  BIT_DEPTH bit_depth;
+  int width;  // width of data passed to the compressor
+  int height;  // height of data passed to the compressor
+  double framerate;  // set to passed in framerate
+  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
+
+  int noise_sensitivity;  // pre processing blur: recommendation 0
+  int sharpness;  // sharpening output: recommendation 0:
+  int speed;
+  unsigned int rc_max_intra_bitrate_pct;
+
+  MODE mode;
+
+  // Key Framing Operations
+  int auto_key;  // autodetect cut scenes and set the keyframes
+  int key_freq;  // maximum distance to key frame.
+
+  int lag_in_frames;  // how many frames lag before we start encoding
+
+  // ----------------------------------------------------------------
+  // DATARATE CONTROL OPTIONS
+
+  RC_MODE rc_mode;  // vbr, cbr, constrained quality or constant quality
+
+  // buffer targeting aggressiveness
+  int under_shoot_pct;
+  int over_shoot_pct;
+
+  // buffering parameters
+  int64_t starting_buffer_level;  // in seconds
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
+
+  // Frame drop threshold.
+  int drop_frames_water_mark;
+
+  // controlling quality
+  int fixed_q;
+  int worst_allowed_q;
+  int best_allowed_q;
+  int cq_level;
+  int lossless;
+  AQ_MODE aq_mode;  // Adaptive Quantization mode
+
+  // Internal frame size scaling.
+  int allow_spatial_resampling;
+  int scaled_frame_width;
+  int scaled_frame_height;
+
+  // Enable feature to reduce the frame quantization every x frames.
+  int frame_periodic_boost;
+
+  // two pass datarate control
+  int two_pass_vbrbias;        // two pass datarate control tweaks
+  int two_pass_vbrmin_section;
+  int two_pass_vbrmax_section;
+  // END DATARATE CONTROL OPTIONS
+  // ----------------------------------------------------------------
+
+  // Spatial and temporal scalability.
+  int ss_number_layers;  // Number of spatial layers.
+  int ts_number_layers;  // Number of temporal layers.
+  // Bitrate allocation for spatial layers.
+  int ss_target_bitrate[VPX_SS_MAX_LAYERS];
+  // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
+  int ts_target_bitrate[VPX_TS_MAX_LAYERS];
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+
+  // These parameters are not meant to be used in final builds. Do not use!
+  int play_alternate;
+  int alt_freq;
+
+  int encode_breakout;  // early breakout : for video conf recommend 800
+
+  /* Bitfield defining the error resiliency features to enable.
+   * Can provide decodable frames after losses in previous
+   * frames and decodable partitions after losses in the same frame.
+   */
+  unsigned int error_resilient_mode;
+
+  /* Bitfield defining the parallel decoding mode where the
+   * decoding in successive frames may be conducted in parallel
+   * just by decoding the frame headers.
+   */
+  unsigned int frame_parallel_decoding_mode;
+
+  int arnr_max_frames;
+  int arnr_strength;
+  int arnr_type;
+
+  int tile_columns;
+  int tile_rows;
+
+  struct vpx_fixed_buf         two_pass_stats_in;
+  struct vpx_codec_pkt_list  *output_pkt_list;
+
+  vp8e_tuning tuning;
+} VP9EncoderConfig;
+
+static INLINE int is_best_mode(MODE mode) {
+  return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
+}
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+  int thresh_mult_sub8x8[MAX_REFS];
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+  int64_t tx_select_diff[TX_MODES];
+  // FIXME(rbultje) can this overflow?
+  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
+
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t mask_filter;
+
+  int RDMULT;
+  int RDDIV;
+} RD_OPT;
+
+typedef struct VP9_COMP {
+  QUANTS quants;
+  MACROBLOCK mb;
+  VP9_COMMON common;
+  VP9EncoderConfig oxcf;
+  struct lookahead_ctx    *lookahead;
+  struct lookahead_entry  *source;
+#if CONFIG_MULTIPLE_ARF
+  struct lookahead_entry  *alt_ref_source[REF_FRAMES];
+#else
+  struct lookahead_entry  *alt_ref_source;
+#endif
+  struct lookahead_entry  *last_source;
+
+  YV12_BUFFER_CONFIG *Source;
+  YV12_BUFFER_CONFIG *Last_Source;  // NULL for first frame and alt_ref frames
+  YV12_BUFFER_CONFIG *un_scaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+  YV12_BUFFER_CONFIG *unscaled_last_source;
+  YV12_BUFFER_CONFIG scaled_last_source;
+
+  int gold_is_last;  // gold same as last frame ( short circuit gold searches)
+  int alt_is_last;  // Alt same as last ( short circuit altref search)
+  int gold_is_alt;  // don't do both alt and gold search ( just do gold).
+
+  int scaled_ref_idx[3];
+  int lst_fb_idx;
+  int gld_fb_idx;
+  int alt_fb_idx;
+
+#if CONFIG_MULTIPLE_ARF
+  int alt_ref_fb_idx[REF_FRAMES - 3];
+#endif
+  int refresh_last_frame;
+  int refresh_golden_frame;
+  int refresh_alt_ref_frame;
+
+  int ext_refresh_frame_flags_pending;
+  int ext_refresh_last_frame;
+  int ext_refresh_golden_frame;
+  int ext_refresh_alt_ref_frame;
+
+  int ext_refresh_frame_context_pending;
+  int ext_refresh_frame_context;
+
+  YV12_BUFFER_CONFIG last_frame_uf;
+
+  TOKENEXTRA *tok;
+  unsigned int tok_count[4][1 << 6];
+
+#if CONFIG_MULTIPLE_ARF
+  // Position within a frame coding order (including any additional ARF frames).
+  unsigned int sequence_number;
+  // Next frame in naturally occurring order that has not yet been coded.
+  int next_frame_in_order;
+#endif
+
+  // Ambient reconstruction err target for force key frames
+  int ambient_err;
+
+  RD_OPT rd;
+
+  CODING_CONTEXT coding_context;
+
+  int zbin_mode_boost;
+  int zbin_mode_boost_enabled;
+  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
+  int active_arnr_strength;         // <= cpi->oxcf.arnr_strength
+
+  int64_t last_time_stamp_seen;
+  int64_t last_end_time_stamp_seen;
+  int64_t first_time_stamp_ever;
+
+  RATE_CONTROL rc;
+
+  vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+
+  struct vpx_codec_pkt_list  *output_pkt_list;
+
+  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+  int mbgraph_n_frames;             // number of frames filled in the above
+  int static_mb_pct;                // % forced skip mbs by segmentation
+
+  int pass;
+
+  int ref_frame_flags;
+
+  SPEED_FEATURES sf;
+
+  unsigned int max_mv_magnitude;
+  int mv_step_param;
+
+  // Default value is 1. From first pass stats, encode_breakout may be disabled.
+  ENCODE_BREAKOUT_TYPE allow_encode_breakout;
+
+  // Get threshold from external input. In real time mode, it can be
+  // overwritten according to encoding speed.
+  int encode_breakout;
+
+  unsigned char *segmentation_map;
+
+  // Per-segment threshold for encode breakout.
+  int  segment_encode_breakout[MAX_SEGMENTS];
+
+  unsigned char *complexity_map;
+
+  unsigned char *active_map;
+  unsigned int active_map_enabled;
+
+  CYCLIC_REFRESH *cyclic_refresh;
+
+  fractional_mv_step_fp *find_fractional_mv_step;
+  fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
+  vp9_full_search_fn_t full_search_sad;
+  vp9_refining_search_fn_t refining_search_sad;
+  vp9_diamond_search_fn_t diamond_search_sad;
+  vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;
+  uint64_t time_pick_lpf;
+  uint64_t time_encode_sb_row;
+
+  struct twopass_rc twopass;
+
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int    count;
+  double total_y;
+  double total_u;
+  double total_v;
+  double total;
+  uint64_t total_sq_error;
+  uint64_t total_samples;
+
+  double totalp_y;
+  double totalp_u;
+  double totalp_v;
+  double totalp;
+  uint64_t totalp_sq_error;
+  uint64_t totalp_samples;
+
+  int    bytes;
+  double summed_quality;
+  double summed_weights;
+  double summedp_quality;
+  double summedp_weights;
+  unsigned int tot_recode_hits;
+
+
+  double total_ssimg_y;
+  double total_ssimg_u;
+  double total_ssimg_v;
+  double total_ssimg_all;
+
+  int b_calculate_ssimg;
+#endif
+  int b_calculate_psnr;
+
+  int droppable;
+
+  int dummy_packing;    /* flag to indicate if packing is dummy */
+
+  unsigned int tx_stepdown_count[TX_SIZES];
+
+  int initial_width;
+  int initial_height;
+
+  int use_svc;
+
+  SVC svc;
+
+  int use_large_partition_rate;
+
+  int frame_flags;
+
+  search_site_config ss_cfg;
+
+  int mbmode_cost[INTRA_MODES];
+  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+  int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+
+#if CONFIG_MULTIPLE_ARF
+  // ARF tracking variables.
+  int multi_arf_enabled;
+  unsigned int frame_coding_order_period;
+  unsigned int new_frame_coding_order_period;
+  int frame_coding_order[MAX_LAG_BUFFERS * 2];
+  int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
+  int arf_weight[MAX_LAG_BUFFERS];
+  int arf_buffered;
+  int this_frame_weight;
+  int max_arf_level;
+#endif
+} VP9_COMP;
+
+void vp9_initialize_enc(void);  // "(void)": a checked, zero-argument prototype.
+
+struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
+void vp9_remove_compressor(VP9_COMP *cpi);
+
+void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made, not just a copy of the pointer.
+int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
+
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush);
+
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags);
+
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags);
+
+void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags);
+
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd);
+
+int vp9_get_reference_enc(VP9_COMP *cpi, int index,
+                          YV12_BUFFER_CONFIG **fb);
+
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd);
+
+int vp9_update_entropy(VP9_COMP *cpi, int update);
+
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int vp9_set_internal_size(VP9_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
+                         unsigned int height);
+
+void vp9_set_svc(VP9_COMP *cpi, int use_svc);
+
+int vp9_get_quantizer(struct VP9_COMP *cpi);
+
+// Maps a reference-frame type to the encoder's slot index for it. Anything
+// that is neither LAST_FRAME nor GOLDEN_FRAME resolves to the alt-ref slot.
+static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
+                                    MV_REFERENCE_FRAME ref_frame) {
+  switch (ref_frame) {
+    case LAST_FRAME: return cpi->lst_fb_idx;
+    case GOLDEN_FRAME: return cpi->gld_fb_idx;
+    default: return cpi->alt_fb_idx;
+  }
+}
+
+// Returns the frame buffer currently mapped to the given reference frame.
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  VP9_COMMON * const cm = &cpi->common;
+  // Two hops: reference slot -> frame-buffer index -> buffer.
+  const int fb_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
+
+  return &cm->frame_bufs[fb_idx].buf;
+}
+
+// Intra-only frames, alt-ref frames, golden frames (except alt-ref overlays)
+// and upper-layer key frames tend to be coded at a higher than ambient
+// quality. Returns 1 for any of those, otherwise 0.
+static INLINE int frame_is_boosted(const VP9_COMP *cpi) {
+  if (frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame) return 1;
+  if (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) return 1;
+  return vp9_is_upper_layer_key_frame(cpi) != 0;
+}
+
+static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+  // TODO(JBB): make this work for alpha channel and double check we can't
+  // exceed this token count if we have a 32x32 transform crossing a boundary
+  // at a multiple of 16.
+  // Budget per 16x16 macroblock: three planes assumed at full resolution with
+  // up to one token per pixel, plus a head room of 4 tokens.
+  const int tokens_per_mb = 16 * 16 * 3 + 4;
+
+  return mb_rows * mb_cols * tokens_per_mb;
+}
+
+int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi);
+
+void vp9_scale_references(VP9_COMP *cpi);
+
+void vp9_update_reference_frames(VP9_COMP *cpi);
+
+int64_t vp9_rescale(int64_t val, int64_t num, int denom);
+
+YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled);
+
+// Points xd->block_refs[0] and [1] at the frame_refs entries for ref0 and
+// ref1. A reference below LAST_FRAME falls back to entry 0.
+static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  const int slot0 = ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0;
+  const int slot1 = ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0;
+
+  xd->block_refs[0] = &cm->frame_refs[slot0];
+  xd->block_refs[1] = &cm->frame_refs[slot1];
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_ENCODER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
new file mode 100644
index 00000000000..dcbb5ac3537
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
@@ -0,0 +1,143 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+                                  uint8_t *dst, int dst_pitch,
+                                  int w, int h,
+                                  int extend_top, int extend_left,
+                                  int extend_bottom, int extend_right) {
+  // Copies a w x h plane from src to dst, then pads dst on all four sides
+  // by replicating the nearest edge pixels.
+  const int linesize = extend_left + w + extend_right;
+  uint8_t *const first_row = dst - extend_left;
+  uint8_t *const last_row = first_row + dst_pitch * (h - 1);
+  const uint8_t *src_row = src;
+  uint8_t *dst_row = first_row;
+  uint8_t *border_row;
+  int i;
+
+  // Copy each row, filling its left border with the row's first pixel and
+  // its right border with the row's last pixel.
+  for (i = 0; i < h; ++i) {
+    vpx_memset(dst_row, src_row[0], extend_left);
+    vpx_memcpy(dst_row + extend_left, src_row, w);
+    vpx_memset(dst_row + extend_left + w, src_row[w - 1], extend_right);
+    src_row += src_pitch;
+    dst_row += dst_pitch;
+  }
+
+  // Replicate the first extended row (side borders included) into the
+  // extend_top rows above it.
+  border_row = first_row - dst_pitch * extend_top;
+  for (i = 0; i < extend_top; ++i) {
+    vpx_memcpy(border_row, first_row, linesize);
+    border_row += dst_pitch;
+  }
+
+  // Likewise replicate the last extended row into the extend_bottom rows
+  // below it.
+  border_row = last_row + dst_pitch;
+  for (i = 0; i < extend_bottom; ++i) {
+    vpx_memcpy(border_row, last_row, linesize);
+    border_row += dst_pitch;
+  }
+}
+
+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  // Copy every plane of src into dst and extend the borders of each plane.
+  // Altref filtering assumes 16 pixel extension
+  const int et_y = 16;
+  const int el_y = 16;
+  // Motion estimation may use src block variance with the block size up
+  // to 64x64, so the right and bottom need to be extended to 64 multiple
+  // or up to 16, whichever is greater. Bottom pads height; right pads width.
+  const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height,
+                       16);
+  const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
+                       16);
+  const int uv_width_subsampling = (src->uv_width != src->y_width);
+  const int uv_height_subsampling = (src->uv_height != src->y_height);
+  const int et_uv = et_y >> uv_height_subsampling;
+  const int el_uv = el_y >> uv_width_subsampling;
+  const int eb_uv = eb_y >> uv_height_subsampling;
+  const int er_uv = er_y >> uv_width_subsampling;
+
+#if CONFIG_ALPHA
+  const int et_a = dst->border >> (dst->alpha_height != dst->y_height);
+  const int el_a = dst->border >> (dst->alpha_width != dst->y_width);
+  const int eb_a = et_a + dst->alpha_height - src->alpha_height;
+  const int er_a = el_a + dst->alpha_width - src->alpha_width;
+
+  copy_and_extend_plane(src->alpha_buffer, src->alpha_stride,
+                        dst->alpha_buffer, dst->alpha_stride,
+                        src->alpha_width, src->alpha_height,
+                        et_a, el_a, eb_a, er_a);
+#endif
+
+  copy_and_extend_plane(src->y_buffer, src->y_stride,
+                        dst->y_buffer, dst->y_stride,
+                        src->y_width, src->y_height,
+                        et_y, el_y, eb_y, er_y);
+
+  copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                        dst->u_buffer, dst->uv_stride,
+                        src->uv_width, src->uv_height,
+                        et_uv, el_uv, eb_uv, er_uv);
+
+  copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                        dst->v_buffer, dst->uv_stride,
+                        src->uv_width, src->uv_height,
+                        et_uv, el_uv, eb_uv, er_uv);
+}
+
+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw) {
+  // A side of the rectangle is extended only if it lies on the frame border.
+  const int at_bottom = (srcy + srch == src->y_height);
+  const int at_right = (srcx + srcw == src->y_width);
+  const int et_y = srcy ? 0 : dst->border;
+  const int el_y = srcx ? 0 : dst->border;
+  const int eb_y = at_bottom ? dst->border + dst->y_height - src->y_height : 0;
+  const int er_y = at_right ? dst->border + dst->y_width - src->y_width : 0;
+  const int src_y_offset = srcy * src->y_stride + srcx;
+  const int dst_y_offset = srcy * dst->y_stride + srcx;
+  // Chroma values are the luma ones halved, i.e. 2x subsampling is assumed.
+  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+
+  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+                        dst->y_buffer + dst_y_offset, dst->y_stride,
+                        srcw, srch,
+                        et_y, el_y, eb_y, er_y);
+
+  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+                        dst->u_buffer + dst_uv_offset, dst->uv_stride,
+                        srcw_uv, srch_uv,
+                        et_uv, el_uv, eb_uv, er_uv);
+
+  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+                        dst->v_buffer + dst_uv_offset, dst->uv_stride,
+                        srcw_uv, srch_uv,
+                        et_uv, el_uv, eb_uv, er_uv);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.h
new file mode 100644
index 00000000000..058fe09cf98
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.h
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_EXTEND_H_
+#define VP9_ENCODER_VP9_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+
+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_EXTEND_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
index c83954e0ceb..ed72d786661 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -8,32 +8,34 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <math.h>
 #include <limits.h>
+#include <math.h>
 #include <stdio.h>
-#include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_firstpass.h"
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"  // vp9_setup_dst_planes()
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12config.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_entropymv.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/encoder/vp9_vaq.h"
-#include "./vpx_scale_rtcd.h"
-// TODO(jkoleszar): for setup_dst_planes
-#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_variance.h"
 
 #define OUTPUT_FPF 0
 
@@ -50,8 +52,17 @@
 
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
-#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
-#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define MIN_KF_BOOST        300
+
+#if CONFIG_MULTIPLE_ARF
+// Set MIN_GF_INTERVAL to 1 for the full decomposition.
+#define MIN_GF_INTERVAL             2
+#else
+#define MIN_GF_INTERVAL             4
+#endif
+
+
+// #define LONG_TERM_VBR_CORRECTION
 
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   YV12_BUFFER_CONFIG temp = *a;
@@ -59,51 +70,41 @@ static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
   *b = temp;
 }
 
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
-
-static int select_cq_level(int qindex) {
-  int ret_val = QINDEX_RANGE - 1;
-  int i;
-
-  double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (target_q <= vp9_convert_qindex_to_q(i)) {
-      ret_val = i;
-      break;
-    }
-  }
-
-  return ret_val;
+static int gfboost_qadjust(int qindex) {
+  const double q = vp9_convert_qindex_to_q(qindex);
+  return (int)((0.00000828 * q * q * q) +
+               (-0.0055 * q * q) +
+               (1.32 * q) + 79.3);
 }
 
-
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
-  cpi->twopass.stats_in = position;
+static void reset_fpf_position(struct twopass_rc *p,
+                               const FIRSTPASS_STATS *position) {
+  p->stats_in = position;
 }
 
-static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+static int lookup_next_frame_stats(const struct twopass_rc *p,
+                                   FIRSTPASS_STATS *next_frame) {
+  if (p->stats_in >= p->stats_in_end)
     return EOF;
 
-  *next_frame = *cpi->twopass.stats_in;
+  *next_frame = *p->stats_in;
   return 1;
 }
 
-// Read frame stats at an offset from the current position
-static int read_frame_stats(VP9_COMP *cpi,
-                            FIRSTPASS_STATS *frame_stats,
-                            int offset) {
-  FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
 
-  // Check legality of offset
+// Read frame stats at an offset from the current position.
+static int read_frame_stats(const struct twopass_rc *p,
+                            FIRSTPASS_STATS *frame_stats, int offset) {
+  const FIRSTPASS_STATS *fps_ptr = p->stats_in;
+
+  // Check legality of offset.
   if (offset >= 0) {
-    if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
+    if (&fps_ptr[offset] >= p->stats_in_end)
       return EOF;
   } else if (offset < 0) {
-    if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
+    if (&fps_ptr[offset] < p->stats_in_start)
       return EOF;
   }
 
@@ -111,19 +112,17 @@ static int read_frame_stats(VP9_COMP *cpi,
   return 1;
 }
 
-static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+static int input_stats(struct twopass_rc *p, FIRSTPASS_STATS *fps) {
+  if (p->stats_in >= p->stats_in_end)
     return EOF;
 
-  *fps = *cpi->twopass.stats_in;
-  cpi->twopass.stats_in =
-    (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
+  *fps = *p->stats_in;
+  ++p->stats_in;
   return 1;
 }
 
-static void output_stats(const VP9_COMP            *cpi,
-                         struct vpx_codec_pkt_list *pktlist,
-                         FIRSTPASS_STATS            *stats) {
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct vpx_codec_pkt_list *pktlist) {
   struct vpx_codec_cx_pkt pkt;
   pkt.kind = VPX_CODEC_STATS_PKT;
   pkt.data.twopass_stats.buf = stats;
@@ -132,12 +131,11 @@ static void output_stats(const VP9_COMP            *cpi,
 
 // TEMP debug code
 #if OUTPUT_FPF
-
   {
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(stdout, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
             "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
             "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
             stats->frame,
@@ -184,10 +182,13 @@ static void zero_stats(FIRSTPASS_STATS *section) {
   section->new_mv_count = 0.0;
   section->count      = 0.0;
   section->duration   = 1.0;
+  section->spatial_layer_id = 0;
 }
 
-static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+static void accumulate_stats(FIRSTPASS_STATS *section,
+                             const FIRSTPASS_STATS *frame) {
   section->frame += frame->frame;
+  section->spatial_layer_id = frame->spatial_layer_id;
   section->intra_error += frame->intra_error;
   section->coded_error += frame->coded_error;
   section->sr_coded_error += frame->sr_coded_error;
@@ -208,7 +209,8 @@ static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
   section->duration   += frame->duration;
 }
 
-static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
   section->frame -= frame->frame;
   section->intra_error -= frame->intra_error;
   section->coded_error -= frame->coded_error;
@@ -254,13 +256,27 @@ static void avg_stats(FIRSTPASS_STATS *section) {
 
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
-static double calculate_modified_err(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
-  const double av_err = stats->ssim_weighted_pred_err / stats->count;
-  const double this_err = this_frame->ssim_weighted_pred_err;
-  return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
-                      this_err > av_err ? POW1 : POW2);
+static double calculate_modified_err(const VP9_COMP *cpi,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const struct twopass_rc *twopass = &cpi->twopass;
+  const SVC *const svc = &cpi->svc;
+  const FIRSTPASS_STATS *stats;
+  double av_err;
+  double modified_error;
+
+  if (svc->number_spatial_layers > 1 &&
+      svc->number_temporal_layers == 1) {
+    twopass = &svc->layer_context[svc->spatial_layer_id].twopass;
+  }
+
+  stats = &twopass->total_stats;
+  av_err = stats->ssim_weighted_pred_err / stats->count;
+  modified_error = av_err * pow(this_frame->ssim_weighted_pred_err /
+                   DOUBLE_DIVIDE_CHECK(av_err),
+                   cpi->oxcf.two_pass_vbrbias / 100.0);
+
+  return fclamp(modified_error,
+                twopass->modified_error_min, twopass->modified_error_max);
 }
 
 static const double weight_table[256] = {
@@ -303,43 +319,34 @@ static const double weight_table[256] = {
   1.000000, 1.000000, 1.000000, 1.000000
 };
 
-static double simple_weight(YV12_BUFFER_CONFIG *source) {
+static double simple_weight(const YV12_BUFFER_CONFIG *buf) {
   int i, j;
+  double sum = 0.0;
+  const int w = buf->y_crop_width;
+  const int h = buf->y_crop_height;
+  const uint8_t *row = buf->y_buffer;
+
+  for (i = 0; i < h; ++i) {
+    const uint8_t *pixel = row;
+    for (j = 0; j < w; ++j)
+      sum += weight_table[*pixel++];
+    row += buf->y_stride;
+  }
 
-  uint8_t *src = source->y_buffer;
-  double sum_weights = 0.0;
-
-  // Loop through the Y plane examining levels and creating a weight for
-  // the image.
-  i = source->y_height;
-  do {
-    j = source->y_width;
-    do {
-      sum_weights += weight_table[ *src];
-      src++;
-    } while (--j);
-    src -= source->y_width;
-    src += source->y_stride;
-  } while (--i);
-
-  sum_weights /= (source->y_height * source->y_width);
-
-  return sum_weights;
+  return MAX(0.1, sum / (w * h));
 }
 
-
-// This function returns the current per frame maximum bitrate target.
-static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame based on the max section guidelines
-  // passed in and how many bits are left.
-  // For VBR base this on the bits and frames left plus the
-  // two_pass_vbrmax_section rate passed in by the user.
-  const double max_bits = (1.0 * cpi->twopass.bits_left /
-      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
-      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
-
-  // Trap case where we are out of bits.
-  return MAX((int)max_bits, 0);
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const VP9EncoderConfig *oxcf) {
+  int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+                          (int64_t)oxcf->two_pass_vbrmax_section) / 100;
+  if (max_bits < 0)
+    max_bits = 0;
+  else if (max_bits > rc->max_frame_bandwidth)
+    max_bits = rc->max_frame_bandwidth;
+
+  return (int)max_bits;
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -347,157 +354,145 @@ void vp9_init_first_pass(VP9_COMP *cpi) {
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+  if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+    int i;
+    for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+      output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
+                   cpi->output_pkt_list);
+    }
+  } else {
+    output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+  }
 }
 
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                             YV12_BUFFER_CONFIG *recon_buffer,
-                             int *best_motion_err, int recon_yoffset) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  // Set up pointers for this macro block recon buffer
-  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
-
-  switch (xd->mi_8x8[0]->mbmi.sb_type) {
+static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+  switch (bsize) {
     case BLOCK_8X8:
-      vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
-                 xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                 (unsigned int *)(best_motion_err));
-      break;
+      return vp9_mse8x8;
     case BLOCK_16X8:
-      vp9_mse16x8(x->plane[0].src.buf, x->plane[0].src.stride,
-                  xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                  (unsigned int *)(best_motion_err));
-      break;
+      return vp9_mse16x8;
     case BLOCK_8X16:
-      vp9_mse8x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                  xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                  (unsigned int *)(best_motion_err));
-      break;
+      return vp9_mse8x16;
     default:
-      vp9_mse16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                   xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
-                   (unsigned int *)(best_motion_err));
-      break;
+      return vp9_mse16x16;
   }
 }
 
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  unsigned int sse;  // Written by the MSE function chosen for bsize.
+  get_block_variance_fn(bsize)(src->buf, src->stride,
+                               ref->buf, ref->stride, &sse);
+  return sse;
+}
+
+// Returns the smallest shift sr for which min(width, height) << sr is no
+// longer below MAX_FULL_PEL_VAL; the first pass uses it to refine the
+// motion search range according to the frame dimension.
+static int get_search_range(const VP9_COMMON *cm) {
+  const int min_dim = MIN(cm->width, cm->height);
+  int shift;
+
+  for (shift = 0; (min_dim << shift) < MAX_FULL_PEL_VAL; ++shift) {}
+  return shift;
+}
+
 static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                     int_mv *ref_mv, MV *best_mv,
-                                     YV12_BUFFER_CONFIG *recon_buffer,
-                                     int *best_motion_err, int recon_yoffset) {
+                                     const MV *ref_mv, MV *best_mv,
+                                     int *best_motion_err) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int num00;
-
-  int_mv tmp_mv;
-  int_mv ref_mv_full;
+  MV tmp_mv = {0, 0};
+  MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int num00, tmp_err, n;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+  const int new_mv_mode_penalty = 256;
 
-  int tmp_err;
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-  int n;
-  vp9_variance_fn_ptr_t v_fn_ptr =
-      cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type];
-  int new_mv_mode_penalty = 256;
-
-  int sr = 0;
-  int quart_frm = MIN(cpi->common.width, cpi->common.height);
-
-  // refine the motion search range accroding to the frame dimension
-  // for first pass test
-  while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
-    sr++;
-  if (sr)
-    sr--;
-
-  step_param    += sr;
+  const int sr = get_search_range(&cpi->common);
+  step_param += sr;
   further_steps -= sr;
 
-  // override the default variance function to use MSE
-  switch (xd->mi_8x8[0]->mbmi.sb_type) {
-    case BLOCK_8X8:
-      v_fn_ptr.vf = vp9_mse8x8;
-      break;
-    case BLOCK_16X8:
-      v_fn_ptr.vf = vp9_mse16x8;
-      break;
-    case BLOCK_8X16:
-      v_fn_ptr.vf = vp9_mse8x16;
-      break;
-    default:
-      v_fn_ptr.vf = vp9_mse16x16;
-      break;
-  }
-
-  // Set up pointers for this macro block recon buffer
-  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
+  // Override the default variance function to use MSE.
+  v_fn_ptr.vf = get_block_variance_fn(bsize);
 
-  // Initial step/diamond search centred on best mv
-  tmp_mv.as_int = 0;
-  ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
-  ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
-  tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param,
-                                    x->sadperbit16, &num00, &v_fn_ptr,
-                                    x->nmvjointcost,
-                                    x->mvcost, ref_mv);
+  // Center the initial step/diamond search on best mv.
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+                                    step_param,
+                                    x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
+  if (tmp_err < INT_MAX)
+    tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
   if (tmp_err < INT_MAX - new_mv_mode_penalty)
     tmp_err += new_mv_mode_penalty;
 
   if (tmp_err < *best_motion_err) {
     *best_motion_err = tmp_err;
-    best_mv->row = tmp_mv.as_mv.row;
-    best_mv->col = tmp_mv.as_mv.col;
+    *best_mv = tmp_mv;
   }
 
-  // Further step/diamond searches as necessary
+  // Carry out further step/diamond searches as necessary.
   n = num00;
   num00 = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
-      num00--;
+      --num00;
     } else {
-      tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
+      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
-                                        &num00, &v_fn_ptr,
-                                        x->nmvjointcost,
-                                        x->mvcost, ref_mv);
+                                        &num00, &v_fn_ptr, ref_mv);
+      if (tmp_err < INT_MAX)
+        tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
       if (tmp_err < INT_MAX - new_mv_mode_penalty)
         tmp_err += new_mv_mode_penalty;
 
       if (tmp_err < *best_motion_err) {
         *best_motion_err = tmp_err;
-        best_mv->row = tmp_mv.as_mv.row;
-        best_mv->col = tmp_mv.as_mv.col;
+        *best_mv = tmp_mv;
       }
     }
   }
 }
 
+static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
+  // Use the 16-pixel dimension only where mi index 2 * mb + 1 is still
+  // inside the frame's mi grid; otherwise fall back to 8 along that axis.
+  const int wide = 2 * mb_col + 1 < cm->mi_cols;
+  const int tall = 2 * mb_row + 1 < cm->mi_rows;
+  if (wide)
+    return tall ? BLOCK_16X16 : BLOCK_16X8;
+  return tall ? BLOCK_8X16 : BLOCK_8X8;
+}
+
 void vp9_first_pass(VP9_COMP *cpi) {
   int mb_row, mb_col;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const PICK_MODE_CONTEXT *ctx = &x->pc_root->none;
+  int i;
 
   int recon_yoffset, recon_uvoffset;
-  const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
-  const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
-  YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
-  YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
   YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
-  const int recon_y_stride = lst_yv12->y_stride;
-  const int recon_uv_stride = lst_yv12->uv_stride;
+  int recon_y_stride = lst_yv12->y_stride;
+  int recon_uv_stride = lst_yv12->uv_stride;
+  int uv_mb_height = 16 >> (lst_yv12->y_height > lst_yv12->uv_height);
   int64_t intra_error = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
 
   int sum_mvr = 0, sum_mvc = 0;
   int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int sum_mvrs = 0, sum_mvcs = 0;
+  int64_t sum_mvrs = 0, sum_mvcs = 0;
   int mvcount = 0;
   int intercount = 0;
   int second_ref_count = 0;
@@ -506,103 +501,124 @@ void vp9_first_pass(VP9_COMP *cpi) {
   int new_mv_count = 0;
   int sum_in_vectors = 0;
   uint32_t lastmv_as_int = 0;
+  struct twopass_rc *twopass = &cpi->twopass;
+  const MV zero_mv = {0, 0};
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+  vp9_clear_system_state();
+
+  if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+    MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
+    const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
+    twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass;
 
-  int_mv zero_ref_mv;
+    vp9_scale_references(cpi);
 
-  zero_ref_mv.as_int = 0;
+    // Use either last frame or alt frame for motion search.
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+      scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+      ref_frame = LAST_FRAME;
+    } else if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+      scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, ALTREF_FRAME);
+      ref_frame = ALTREF_FRAME;
+    }
+
+    if (scaled_ref_buf != NULL) {
+      // Update the stride since we are using scaled reference buffer
+      first_ref_buf = scaled_ref_buf;
+      recon_y_stride = first_ref_buf->y_stride;
+      recon_uv_stride = first_ref_buf->uv_stride;
+      uv_mb_height = 16 >> (first_ref_buf->y_height > first_ref_buf->uv_height);
+    }
 
-  vp9_clear_system_state();  // __asm emms;
+    // Disable golden frame for svc first pass for now.
+    gld_yv12 = NULL;
+    set_ref_ptrs(cm, xd, ref_frame, NONE);
+
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source);
+  }
 
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
-  setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
-  setup_dst_planes(xd, new_yv12, 0, 0);
+  vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
 
-  xd->mi_8x8 = cm->mi_grid_visible;
-  // required for vp9_frame_init_quantizer
-  xd->mi_8x8[0] = cm->mi;
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
 
-  setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   vp9_frame_init_quantizer(cpi);
 
-  // Initialise the MV cost table to the defaults
-  // if( cm->current_video_frame == 0)
-  // if ( 0 )
-  {
-    vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi);
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
   }
+  x->skip_recode = 0;
+
+  vp9_init_mv_probs(cm);
+  vp9_initialize_rd_consts(cpi);
 
-  // tiling is ignored in the first pass
+  // Tiling is ignored in the first pass.
   vp9_tile_init(&tile, cm, 0, 0);
 
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
     int_mv best_ref_mv;
 
     best_ref_mv.as_int = 0;
 
-    // reset above block coeffs
+    // Reset above block coeffs.
     xd->up_available = (mb_row != 0);
     recon_yoffset = (mb_row * recon_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8);
+    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
 
     // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders
+    // outside the UMV borders.
     x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                     + BORDER_MV_PIXELS_B16;
 
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
       int this_error;
-      int gf_motion_error = INT_MAX;
-      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      double error_weight;
+      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+      double error_weight = 1.0;
+      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
 
-      vp9_clear_system_state();  // __asm emms;
-      error_weight = 1.0;  // avoid uninitialized warnings
+      vp9_clear_system_state();
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
-
-      if (mb_col * 2 + 1 < cm->mi_cols) {
-        if (mb_row * 2 + 1 < cm->mi_rows) {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16;
-        } else {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8;
-        }
-      } else {
-        if (mb_row * 2 + 1 < cm->mi_rows) {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16;
-        } else {
-          xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8;
-        }
-      }
-      xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+      xd->mi[0]->mbmi.sb_type = bsize;
+      xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
       set_mi_row_col(xd, &tile,
-                     mb_row << 1,
-                     1 << mi_height_log2(xd->mi_8x8[0]->mbmi.sb_type),
-                     mb_col << 1,
-                     1 << mi_width_log2(xd->mi_8x8[0]->mbmi.sb_type),
+                     mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
                      cm->mi_rows, cm->mi_cols);
 
-      if (cpi->sf.variance_adaptive_quantization) {
-        int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
+      if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+        const int energy = vp9_block_energy(cpi, x, bsize);
         error_weight = vp9_vaq_inv_q_ratio(energy);
       }
 
-      // do intra 16x16 prediction
-      this_error = vp9_encode_intra(x, use_dc_pred);
-      if (cpi->sf.variance_adaptive_quantization) {
-        vp9_clear_system_state();  // __asm emms;
-        this_error *= error_weight;
+      // Do intra 16x16 prediction.
+      x->skip_encode = 0;
+      xd->mi[0]->mbmi.mode = DC_PRED;
+      xd->mi[0]->mbmi.tx_size = use_dc_pred ?
+         (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+      vp9_encode_intra_block_plane(x, bsize, 0);
+      this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+
+      if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+        vp9_clear_system_state();
+        this_error = (int)(this_error * error_weight);
       }
 
-      // intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (eg a plain black frame).
+      // Intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
       // We do not have special cases in first pass for 0,0 and nearest etc so
       // all inter modes carry an overhead cost estimate for the mv.
       // When the error score is very low this causes us to pick all or lots of
@@ -610,44 +626,43 @@ void vp9_first_pass(VP9_COMP *cpi) {
       // This penalty adds a cost matching that of a 0,0 mv to the intra case.
       this_error += intrapenalty;
 
-      // Cumulative intra error total
+      // Accumulate the intra error.
       intra_error += (int64_t)this_error;
 
       // Set up limit values for motion vectors to prevent them extending
       // outside the UMV borders.
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
-                      + BORDER_MV_PIXELS_B16;
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
-      // Other than for the first frame do a motion search
+      // Other than for the first frame do a motion search.
       if (cm->current_video_frame > 0) {
-        int tmp_err;
-        int motion_error = INT_MAX;
+        int tmp_err, motion_error;
         int_mv mv, tmp_mv;
 
-        // Simple 0,0 motion with no mv overhead
-        zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
+        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+        motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                            &xd->plane[0].pre[0]);
+        // Assume 0,0 motion with no mv overhead.
         mv.as_int = tmp_mv.as_int = 0;
 
         // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search
-        first_pass_motion_search(cpi, x, &best_ref_mv,
-                                 &mv.as_mv, lst_yv12,
-                                 &motion_error, recon_yoffset);
-        if (cpi->sf.variance_adaptive_quantization) {
-          vp9_clear_system_state();  // __asm emms;
-          motion_error *= error_weight;
+        // starting point (best reference) for the search.
+        first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
+                                 &motion_error);
+        if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+          vp9_clear_system_state();
+          motion_error = (int)(motion_error * error_weight);
         }
 
         // If the current best reference mv is not centered on 0,0 then do a 0,0
         // based search as well.
         if (best_ref_mv.as_int) {
           tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
-                                   lst_yv12, &tmp_err, recon_yoffset);
-          if (cpi->sf.variance_adaptive_quantization) {
-            vp9_clear_system_state();  // __asm emms;
-            tmp_err *= error_weight;
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+                                   &tmp_err);
+          if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+            vp9_clear_system_state();
+            tmp_err = (int)(tmp_err * error_weight);
           }
 
           if (tmp_err < motion_error) {
@@ -656,34 +671,34 @@ void vp9_first_pass(VP9_COMP *cpi) {
           }
         }
 
-        // Experimental search in an older reference frame
-        if (cm->current_video_frame > 1) {
-          // Simple 0,0 motion with no mv overhead
-          zz_motion_search(cpi, x, gld_yv12,
-                           &gf_motion_error, recon_yoffset);
-
-          first_pass_motion_search(cpi, x, &zero_ref_mv,
-                                   &tmp_mv.as_mv, gld_yv12,
-                                   &gf_motion_error, recon_yoffset);
-          if (cpi->sf.variance_adaptive_quantization) {
-            vp9_clear_system_state();  // __asm emms;
-            gf_motion_error *= error_weight;
-          }
+        // Search in an older reference frame.
+        if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+          // Assume 0,0 motion with no mv overhead.
+          int gf_motion_error;
+
+          xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+          gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                 &xd->plane[0].pre[0]);
 
-          if ((gf_motion_error < motion_error) &&
-              (gf_motion_error < this_error)) {
-            second_ref_count++;
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+                                   &gf_motion_error);
+          if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+            vp9_clear_system_state();
+            gf_motion_error = (int)(gf_motion_error * error_weight);
           }
 
-          // Reset to last frame as reference buffer
-          xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
-          xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset;
-          xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset;
+          if (gf_motion_error < motion_error && gf_motion_error < this_error)
+            ++second_ref_count;
 
-          // In accumulating a score for the older reference frame
-          // take the best of the motion predicted score and
-          // the intra coded error (just as will be done for)
-          // accumulation of "coded_error" for the last frame.
+          // Reset to last frame as reference buffer.
+          xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for the accumulation of "coded_error" for
+          // the last frame).
           if (gf_motion_error < this_error)
             sr_coded_error += gf_motion_error;
           else
@@ -691,74 +706,69 @@ void vp9_first_pass(VP9_COMP *cpi) {
         } else {
           sr_coded_error += motion_error;
         }
-        /* Intra assumed best */
+        // Start by assuming that intra mode is best.
         best_ref_mv.as_int = 0;
 
         if (motion_error <= this_error) {
-          // Keep a count of cases where the inter and intra were
-          // very close and very low. This helps with scene cut
-          // detection for example in cropped clips with black bars
-          // at the sides or top and bottom.
-          if ((((this_error - intrapenalty) * 9) <=
-               (motion_error * 10)) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count++;
-          }
+          // Keep a count of cases where the inter and intra were very close
+          // and very low. This helps with scene cut detection for example in
+          // cropped clips with black bars at the sides or top and bottom.
+          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+              this_error < 2 * intrapenalty)
+            ++neutral_count;
 
           mv.as_mv.row *= 8;
           mv.as_mv.col *= 8;
           this_error = motion_error;
-          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
-          xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
-          xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
-          xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
-          vp9_build_inter_predictors_sby(xd, mb_row << 1,
-                                         mb_col << 1,
-                                         xd->mi_8x8[0]->mbmi.sb_type);
-          vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
+          xd->mi[0]->mbmi.mode = NEWMV;
+          xd->mi[0]->mbmi.mv[0] = mv;
+          xd->mi[0]->mbmi.tx_size = TX_4X4;
+          xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->mbmi.ref_frame[1] = NONE;
+          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+          vp9_encode_sby_pass1(x, bsize);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
           sum_mvc_abs += abs(mv.as_mv.col);
           sum_mvrs += mv.as_mv.row * mv.as_mv.row;
           sum_mvcs += mv.as_mv.col * mv.as_mv.col;
-          intercount++;
+          ++intercount;
 
           best_ref_mv.as_int = mv.as_int;
 
-          // Was the vector non-zero
           if (mv.as_int) {
-            mvcount++;
+            ++mvcount;
 
-            // Was it different from the last non zero vector
+            // Non-zero vector, was it different from the last non-zero vector?
             if (mv.as_int != lastmv_as_int)
-              new_mv_count++;
+              ++new_mv_count;
             lastmv_as_int = mv.as_int;
 
-            // Does the Row vector point inwards or outwards
+            // Does the row vector point inwards or outwards?
             if (mb_row < cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_row > cm->mb_rows / 2) {
               if (mv.as_mv.row > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.row < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
 
-            // Does the Row vector point inwards or outwards
+            // Does the col vector point inwards or outwards?
             if (mb_col < cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
             } else if (mb_col > cm->mb_cols / 2) {
               if (mv.as_mv.col > 0)
-                sum_in_vectors++;
+                ++sum_in_vectors;
               else if (mv.as_mv.col < 0)
-                sum_in_vectors--;
+                --sum_in_vectors;
             }
           }
         }
@@ -767,108 +777,104 @@ void vp9_first_pass(VP9_COMP *cpi) {
       }
       coded_error += (int64_t)this_error;
 
-      // adjust to the next column of macroblocks
+      // Adjust to the next column of MBs.
       x->plane[0].src.buf += 16;
-      x->plane[1].src.buf += 8;
-      x->plane[2].src.buf += 8;
+      x->plane[1].src.buf += uv_mb_height;
+      x->plane[2].src.buf += uv_mb_height;
 
       recon_yoffset += 16;
-      recon_uvoffset += 8;
+      recon_uvoffset += uv_mb_height;
     }
 
-    // adjust to the next row of mbs
+    // Adjust to the next row of MBs.
     x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
-    x->plane[1].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols;
-    x->plane[2].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols;
+    x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * cm->mb_cols;
+    x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride -
+                           uv_mb_height * cm->mb_cols;
 
-    vp9_clear_system_state();  // __asm emms;
+    vp9_clear_system_state();
   }
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
   {
-    double weight = 0.0;
-
     FIRSTPASS_STATS fps;
 
-    fps.frame      = cm->current_video_frame;
+    fps.frame = cm->current_video_frame;
+    fps.spatial_layer_id = cpi->svc.spatial_layer_id;
     fps.intra_error = (double)(intra_error >> 8);
     fps.coded_error = (double)(coded_error >> 8);
     fps.sr_coded_error = (double)(sr_coded_error >> 8);
-    weight = simple_weight(cpi->Source);
-
-
-    if (weight < 0.1)
-      weight = 0.1;
-
-    fps.ssim_weighted_pred_err = fps.coded_error * weight;
-
-    fps.pcnt_inter  = 0.0;
-    fps.pcnt_motion = 0.0;
-    fps.MVr        = 0.0;
-    fps.mvr_abs     = 0.0;
-    fps.MVc        = 0.0;
-    fps.mvc_abs     = 0.0;
-    fps.MVrv       = 0.0;
-    fps.MVcv       = 0.0;
-    fps.mv_in_out_count  = 0.0;
-    fps.new_mv_count = 0.0;
-    fps.count      = 1.0;
-
-    fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
-    fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
-    fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+    fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source);
+    fps.count = 1.0;
+    fps.pcnt_inter = (double)intercount / cm->MBs;
+    fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
+    fps.pcnt_neutral = (double)neutral_count / cm->MBs;
 
     if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / (double)mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
-      fps.MVc = (double)sum_mvc / (double)mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) /
-                 (double)mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) /
-                 (double)mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+      fps.MVr = (double)sum_mvr / mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+      fps.MVc = (double)sum_mvc / mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount;
+      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-
-      fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+      fps.pcnt_motion = (double)mvcount / cm->MBs;
+    } else {
+      fps.MVr = 0.0;
+      fps.mvr_abs = 0.0;
+      fps.MVc = 0.0;
+      fps.mvc_abs = 0.0;
+      fps.MVrv = 0.0;
+      fps.MVcv = 0.0;
+      fps.mv_in_out_count = 0.0;
+      fps.new_mv_count = 0.0;
+      fps.pcnt_motion = 0.0;
     }
 
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
     // something less than the full time between subsequent values of
     // cpi->source_time_stamp.
-    fps.duration = (double)(cpi->source->ts_end
-                            - cpi->source->ts_start);
+    fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
-    // don't want to do output stats with a stack variable!
-    cpi->twopass.this_frame_stats = fps;
-    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
-    accumulate_stats(&cpi->twopass.total_stats, &fps);
+    // Don't want to do output stats with a stack variable!
+    twopass->this_frame_stats = fps;
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+    accumulate_stats(&twopass->total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
-  // the prediction is good enough... but also dont allow it to lag too far
-  if ((cpi->twopass.sr_update_lag > 3) ||
+  // the prediction is good enough... but also don't allow it to lag too far.
+  if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats.intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
-        2.0))) {
-    vp8_yv12_copy_frame(lst_yv12, gld_yv12);
-    cpi->twopass.sr_update_lag = 1;
+       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+       ((twopass->this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+    if (gld_yv12 != NULL) {
+      vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+    }
+    twopass->sr_update_lag = 1;
   } else {
-    cpi->twopass.sr_update_lag++;
+    ++twopass->sr_update_lag;
   }
-  // swap frame pointers so last frame refers to the frame we just compressed
-  swap_yv12(lst_yv12, new_yv12);
 
-  vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
+  vp9_extend_frame_borders(new_yv12);
+
+  if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+    vp9_update_reference_frames(cpi);
+  } else {
+    // Swap frame pointers so last frame refers to the frame we just compressed.
+    swap_yv12(lst_yv12, new_yv12);
+  }
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
-  if (cm->current_video_frame == 0)
+  if (cm->current_video_frame == 0 && gld_yv12 != NULL) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+  }
 
-  // use this to see what the first pass reconstruction looks like
+  // Use this to see what the first pass reconstruction looks like.
   if (0) {
     char filename[512];
     FILE *recon_file;
@@ -884,52 +890,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
     fclose(recon_file);
   }
 
-  cm->current_video_frame++;
-}
-
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-//
-
-
-static double bitcost(double prob) {
-  return -(log(prob) / log(2.0));
-}
-
-static int64_t estimate_modemvcost(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *fpstats) {
-#if 0
-  int mv_cost;
-  int mode_cost;
-
-  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
-  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
-  double av_intra = (1.0 - av_pct_inter);
-
-  double zz_cost;
-  double motion_cost;
-  double intra_cost;
-
-  zz_cost = bitcost(av_pct_inter - av_pct_motion);
-  motion_cost = bitcost(av_pct_motion);
-  intra_cost = bitcost(av_intra);
-
-  // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
-  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
-
-  // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
-  mode_cost =
-    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
-           (av_pct_motion * motion_cost) +
-           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
-
-  // return mv_cost + mode_cost;
-  // TODO(paulwilkins): Fix overhead costs for extended Q range.
-#endif
-  return 0;
+  ++cm->current_video_frame;
 }
 
 static double calc_correction_factor(double err_per_mb,
@@ -940,340 +901,182 @@ static double calc_correction_factor(double err_per_mb,
   const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low,
                                 pt_high);
 
-  // Calculate correction factor
+  // Calculate correction factor.
   if (power_term < 1.0)
     assert(error_term >= 0.0);
 
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-// Given a current maxQ value sets a range for future values.
-// PGW TODO..
-// This code removes direct dependency on QIndex to determine the range
-// (now uses the actual quantizer) but has not been tuned.
-static void adjust_maxq_qrange(VP9_COMP *cpi) {
-  int i;
-  // Set the max corresponding to cpi->avg_q * 2.0
-  double q = cpi->avg_q * 2.0;
-  cpi->twopass.maxq_max_limit = cpi->worst_quality;
-  for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
-    cpi->twopass.maxq_max_limit = i;
-    if (vp9_convert_qindex_to_q(i) >= q)
-      break;
-  }
-
-  // Set the min corresponding to cpi->avg_q * 0.5
-  q = cpi->avg_q * 0.5;
-  cpi->twopass.maxq_min_limit = cpi->best_quality;
-  for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
-    cpi->twopass.maxq_min_limit = i;
-    if (vp9_convert_qindex_to_q(i) <= q)
-      break;
-  }
-}
+static int get_twopass_worst_quality(const VP9_COMP *cpi,
+                                     const FIRSTPASS_STATS *stats,
+                                     int section_target_bandwidth) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
-static int estimate_max_q(VP9_COMP *cpi,
-                          FIRSTPASS_STATS *fpstats,
-                          int section_target_bandwitdh) {
-  int q;
-  int num_mbs = cpi->common.MBs;
-  int target_norm_bits_per_mb;
-
-  double section_err = fpstats->coded_error / fpstats->count;
-  double sr_correction;
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
-  double speed_correction = 1.0;
-
-  if (section_target_bandwitdh <= 0)
-    return cpi->twopass.maxq_max_limit;          // Highest value allowed
-
-  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
-                              ? (512 * section_target_bandwitdh) / num_mbs
-                              : 512 * (section_target_bandwitdh / num_mbs);
-
-  // Look at the drop in prediction quality between the last frame
-  // and the GF buffer (which contained an older frame).
-  if (fpstats->sr_coded_error > fpstats->coded_error) {
-    double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
-                             (fpstats->count * cpi->common.MBs);
-    sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
+  if (section_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
   } else {
-    sr_correction = 0.75;
-  }
-
-  // Calculate a corrective factor based on a rolling ratio of bits spent
-  // vs target bits
-  if (cpi->rolling_target_bits > 0 &&
-      cpi->active_worst_quality < cpi->worst_quality) {
-    double rolling_ratio = (double)cpi->rolling_actual_bits /
-                               (double)cpi->rolling_target_bits;
-
-    if (rolling_ratio < 0.95)
-      cpi->twopass.est_max_qcorrection_factor -= 0.005;
-    else if (rolling_ratio > 1.05)
-      cpi->twopass.est_max_qcorrection_factor += 0.005;
-
-    cpi->twopass.est_max_qcorrection_factor = fclamp(
-        cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
-  }
-
-  // Corrections for higher compression speed settings
-  // (reduced compression expected)
-  // FIXME(jimbankoski): Once we settle on vp9 speed features we need to
-  // change this code.
-  if (cpi->compressor_speed == 1)
-    speed_correction = cpi->oxcf.cpu_used <= 5 ?
-                          1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) :
-                          1.25;
-
-  // Try and pick a max Q that will be high enough to encode the
-  // content at the given rate.
-  for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
-    int bits_per_mb_at_this_q;
-
-    err_correction_factor = calc_correction_factor(err_per_mb,
-                                                   ERR_DIVISOR, 0.4, 0.90, q) *
-                                sr_correction * speed_correction *
-                                cpi->twopass.est_max_qcorrection_factor;
-
-    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
-                                            err_correction_factor);
-
-    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
-      break;
-  }
-
-  // Restriction on active max q for constrained quality mode.
-  if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
-      q < cpi->cq_target_quality)
-    q = cpi->cq_target_quality;
-
-  // Adjust maxq_min_limit and maxq_max_limit limits based on
-  // average q observed in clip for non kf/gf/arf frames
-  // Give average a chance to settle though.
-  // PGW TODO.. This code is broken for the extended Q range
-  if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
-      cpi->ni_frames > 25)
-    adjust_maxq_qrange(cpi);
-
-  return q;
-}
-
-// For cq mode estimate a cq level that matches the observed
-// complexity and data rate.
-static int estimate_cq(VP9_COMP *cpi,
-                       FIRSTPASS_STATS *fpstats,
-                       int section_target_bandwitdh) {
-  int q;
-  int num_mbs = cpi->common.MBs;
-  int target_norm_bits_per_mb;
-
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
-  double sr_err_diff;
-  double sr_correction;
-  double speed_correction = 1.0;
-  double clip_iiratio;
-  double clip_iifactor;
-
-  target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
-                            ? (512 * section_target_bandwitdh) / num_mbs
-                            : 512 * (section_target_bandwitdh / num_mbs);
-
-
-  // Corrections for higher compression speed settings
-  // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04);
-    else
-      speed_correction = 1.25;
-  }
-
-  // Look at the drop in prediction quality between the last frame
-  // and the GF buffer (which contained an older frame).
-  if (fpstats->sr_coded_error > fpstats->coded_error) {
-    sr_err_diff =
-      (fpstats->sr_coded_error - fpstats->coded_error) /
-      (fpstats->count * cpi->common.MBs);
-    sr_correction = (sr_err_diff / 32.0);
-    sr_correction = pow(sr_correction, 0.25);
-    if (sr_correction < 0.75)
-      sr_correction = 0.75;
-    else if (sr_correction > 1.25)
-      sr_correction = 1.25;
-  } else {
-    sr_correction = 0.75;
-  }
-
-  // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats.intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
-  clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
-  if (clip_iifactor < 0.80)
-    clip_iifactor = 0.80;
-
-  // Try and pick a Q that can encode the content at the given rate.
-  for (q = 0; q < MAXQ; q++) {
-    int bits_per_mb_at_this_q;
-
-    // Error per MB based correction factor
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
-      sr_correction * speed_correction * clip_iifactor;
+    const int num_mbs = cpi->common.MBs;
+    const double section_err = stats->coded_error / stats->count;
+    const double err_per_mb = section_err / num_mbs;
+    const double speed_term = 1.0 + 0.04 * oxcf->speed;
+    const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
+                                            BPER_MB_NORMBITS) / num_mbs;
+    int q;
+    int is_svc_upper_layer = 0;
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+        cpi->svc.spatial_layer_id > 0) {
+      is_svc_upper_layer = 1;
+    }
 
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+      const double factor =
+          calc_correction_factor(err_per_mb, ERR_DIVISOR,
+                                 is_svc_upper_layer ? 0.8 : 0.5,
+                                 is_svc_upper_layer ? 1.0 : 0.90, q);
+      const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
+                                                 factor * speed_term);
+      if (bits_per_mb <= target_norm_bits_per_mb)
+        break;
+    }
 
-    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
-      break;
+    // Restriction on active max q for constrained quality mode.
+    if (cpi->oxcf.rc_mode == RC_MODE_CONSTRAINED_QUALITY)
+      q = MAX(q, oxcf->cq_level);
+    return q;
   }
-
-  // Clip value to range "best allowed to (worst allowed - 1)"
-  q = select_cq_level(q);
-  if (q >= cpi->worst_quality)
-    q = cpi->worst_quality - 1;
-  if (q < cpi->best_quality)
-    q = cpi->best_quality;
-
-  return q;
 }
 
 extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
-  FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS *start_pos;
-
-  double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
-  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
-                                      cpi->oxcf.two_pass_vbrmin_section / 100);
-
-  if (two_pass_min_rate < lower_bounds_min_rate)
-    two_pass_min_rate = lower_bounds_min_rate;
-
-  zero_stats(&cpi->twopass.total_stats);
-  zero_stats(&cpi->twopass.total_left_stats);
-
-  if (!cpi->twopass.stats_in_end)
+  SVC *const svc = &cpi->svc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
+                             (svc->number_temporal_layers == 1);
+  struct twopass_rc *const twopass = is_spatial_svc ?
+      &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
+  double frame_rate;
+  FIRSTPASS_STATS *stats;
+
+  zero_stats(&twopass->total_stats);
+  zero_stats(&twopass->total_left_stats);
+
+  if (!twopass->stats_in_end)
     return;
 
-  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
+  stats = &twopass->total_stats;
+
+  *stats = *twopass->stats_in_end;
+  twopass->total_left_stats = *stats;
 
-  // each frame can have a different duration, as the frame rate in the source
-  // isn't guaranteed to be constant.   The frame rate prior to the first frame
-  // encoded in the second pass is a guess.  However the sum duration is not.
-  // Its calculated based on the actual durations of all frames from the first
-  // pass.
-  vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
-                       cpi->twopass.total_stats.duration);
+  frame_rate = 10000000.0 * stats->count / stats->duration;
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
 
-  cpi->output_framerate = cpi->oxcf.framerate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
-                                     cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
-                                      two_pass_min_rate / 10000000.0);
+  if (is_spatial_svc) {
+    vp9_update_spatial_layer_framerate(cpi, frame_rate);
+    twopass->bits_left = (int64_t)(stats->duration *
+        svc->layer_context[svc->spatial_layer_id].target_bandwidth /
+        10000000.0);
+  } else {
+    vp9_new_framerate(cpi, frame_rate);
+    twopass->bits_left = (int64_t)(stats->duration * oxcf->target_bandwidth /
+                             10000000.0);
+  }
 
   // Calculate a minimum intra value to be used in determining the IIratio
   // scores used in the second pass. We have this minimum to make sure
   // that clips that are static but "low complexity" in the intra domain
-  // are still boosted appropriately for KF/GF/ARF
-  cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-  cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  // are still boosted appropriately for KF/GF/ARF.
+  if (!is_spatial_svc) {
+    // In the spatial SVC case the number of MBs for each layer is not known
+    // at this point, so these minimums are set later for that case.
+    twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+    twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  }
 
-  // This variable monitors how far behind the second ref update is lagging
-  cpi->twopass.sr_update_lag = 1;
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
 
-  // Scan the first pass file and calculate an average Intra / Inter error score
-  // ratio for the sequence.
+  // Scan the first pass file and calculate an average Intra / Inter error
+  // score ratio for the sequence.
   {
+    const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+    FIRSTPASS_STATS this_frame;
     double sum_iiratio = 0.0;
-    double IIRatio;
 
-    start_pos = cpi->twopass.stats_in;  // Note the starting "file" position.
-
-    while (input_stats(cpi, &this_frame) != EOF) {
-      IIRatio = this_frame.intra_error
-                / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
-      IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
-      sum_iiratio += IIRatio;
+    while (input_stats(twopass, &this_frame) != EOF) {
+      const double iiratio = this_frame.intra_error /
+                                 DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+      sum_iiratio += fclamp(iiratio, 1.0, 20.0);
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio /
-        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
+    twopass->avg_iiratio = sum_iiratio /
+                               DOUBLE_DIVIDE_CHECK((double)stats->count);
 
-    // Reset file position
-    reset_fpf_position(cpi, start_pos);
+    reset_fpf_position(twopass, start_pos);
   }
 
   // Scan the first pass file and calculate a modified total error based upon
   // the bias/power function used to allocate bits.
   {
-    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
+    const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+    FIRSTPASS_STATS this_frame;
+    const double av_error = stats->ssim_weighted_pred_err /
+                                DOUBLE_DIVIDE_CHECK(stats->count);
+
 
-    cpi->twopass.modified_error_total = 0.0;
-    cpi->twopass.modified_error_used = 0.0;
+    twopass->modified_error_total = 0.0;
+    twopass->modified_error_min =
+        (av_error * oxcf->two_pass_vbrmin_section) / 100;
+    twopass->modified_error_max =
+        (av_error * oxcf->two_pass_vbrmax_section) / 100;
 
-    while (input_stats(cpi, &this_frame) != EOF) {
-      cpi->twopass.modified_error_total +=
+    while (input_stats(twopass, &this_frame) != EOF) {
+      twopass->modified_error_total +=
           calculate_modified_err(cpi, &this_frame);
     }
-    cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+    twopass->modified_error_left = twopass->modified_error_total;
 
-    reset_fpf_position(cpi, start_pos);  // Reset file position
+    reset_fpf_position(twopass, start_pos);
   }
-}
 
-void vp9_end_second_pass(VP9_COMP *cpi) {
+  // Reset the vbr bits off target counter
+  cpi->rc.vbr_bits_off_target = 0;
 }
 
-// This function gives and estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
-static double get_prediction_decay_rate(VP9_COMP *cpi,
-                                        FIRSTPASS_STATS *next_frame) {
-  double prediction_decay_rate;
-  double second_ref_decay;
-  double mb_sr_err_diff;
-
-  // Initial basis is the % mbs inter coded
-  prediction_decay_rate = next_frame->pcnt_inter;
-
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_prediction_decay_rate(const VP9_COMMON *cm,
+                                        const FIRSTPASS_STATS *next_frame) {
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
-                   cpi->common.MBs;
-  if (mb_sr_err_diff <= 512.0) {
-    second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
-    second_ref_decay = pow(second_ref_decay, 0.5);
-    if (second_ref_decay < 0.85)
-      second_ref_decay = 0.85;
-    else if (second_ref_decay > 1.0)
-      second_ref_decay = 1.0;
-  } else {
-    second_ref_decay = 0.85;
-  }
+  const double mb_sr_err_diff = (next_frame->sr_coded_error -
+                                     next_frame->coded_error) / cm->MBs;
+  const double second_ref_decay = mb_sr_err_diff <= 512.0
+      ? fclamp(pow(1.0 - (mb_sr_err_diff / 512.0), 0.5), 0.85, 1.0)
+      : 0.85;
 
-  if (second_ref_decay < prediction_decay_rate)
-    prediction_decay_rate = second_ref_decay;
-
-  return prediction_decay_rate;
+  return MIN(second_ref_decay, next_frame->pcnt_inter);
 }
 
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(
-  VP9_COMP *cpi,
-  int frame_interval,
-  int still_interval,
-  double loop_decay_rate,
-  double last_decay_rate) {
+static int detect_transition_to_still(struct twopass_rc *twopass,
+                                      int frame_interval, int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
   int trans_to_still = 0;
 
   // Break clause to detect very still sections after motion
@@ -1283,25 +1086,21 @@ static int detect_transition_to_still(
       loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-    FIRSTPASS_STATS *position = cpi->twopass.stats_in;
+    const FIRSTPASS_STATS *position = twopass->stats_in;
     FIRSTPASS_STATS tmp_next_frame;
-    double zz_inter;
 
-    // Look ahead a few frames to see if static condition
-    // persists...
-    for (j = 0; j < still_interval; j++) {
-      if (EOF == input_stats(cpi, &tmp_next_frame))
+    // Look ahead a few frames to see if static condition persists...
+    for (j = 0; j < still_interval; ++j) {
+      if (EOF == input_stats(twopass, &tmp_next_frame))
         break;
 
-      zz_inter =
-        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
-      if (zz_inter < 0.999)
+      if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
         break;
     }
-    // Reset file position
-    reset_fpf_position(cpi, position);
 
-    // Only if it does do we signal a transition to still
+    reset_fpf_position(twopass, position);
+
+    // Only if it does do we signal a transition to still.
     if (j == still_interval)
       trans_to_still = 1;
   }
@@ -1311,20 +1110,20 @@ static int detect_transition_to_still(
 
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
-// reflect this
-static int detect_flash(VP9_COMP *cpi, int offset) {
+// reflect this.
+static int detect_flash(const struct twopass_rc *twopass, int offset) {
   FIRSTPASS_STATS next_frame;
 
   int flash_detected = 0;
 
   // Read the frame data.
   // The return is FALSE (no flash detected) if not a valid frame
-  if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
+  if (read_frame_stats(twopass, &next_frame, offset) != EOF) {
     // What we are looking for here is a situation where there is a
     // brief break in prediction (such as a flash) but subsequent frames
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
-    // comapred to pcnt_inter.
+    // compared to pcnt_inter.
     if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
         next_frame.pcnt_second_ref >= 0.5)
       flash_detected = 1;
@@ -1333,56 +1132,48 @@ static int detect_flash(VP9_COMP *cpi, int offset) {
   return flash_detected;
 }
 
-// Update the motion related elements to the GF arf boost calculation
+// Update the motion related elements to the GF arf boost calculation.
 static void accumulate_frame_motion_stats(
   FIRSTPASS_STATS *this_frame,
   double *this_frame_mv_in_out,
   double *mv_in_out_accumulator,
   double *abs_mv_in_out_accumulator,
   double *mv_ratio_accumulator) {
-  // double this_frame_mv_in_out;
-  double this_frame_mvr_ratio;
-  double this_frame_mvc_ratio;
   double motion_pct;
 
   // Accumulate motion stats.
   motion_pct = this_frame->pcnt_motion;
 
-  // Accumulate Motion In/Out of frame stats
+  // Accumulate Motion In/Out of frame stats.
   *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
   *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
-  *abs_mv_in_out_accumulator +=
-    fabs(this_frame->mv_in_out_count * motion_pct);
+  *abs_mv_in_out_accumulator += fabs(this_frame->mv_in_out_count * motion_pct);
 
   // Accumulate a measure of how uniform (or conversely how random)
-  // the motion field is. (A ratio of absmv / mv)
+  // the motion field is (a ratio of absmv / mv).
   if (motion_pct > 0.05) {
-    this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+    const double this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
 
-    this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+    const double this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
                            DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
 
-    *mv_ratio_accumulator +=
-      (this_frame_mvr_ratio < this_frame->mvr_abs)
+    *mv_ratio_accumulator += (this_frame_mvr_ratio < this_frame->mvr_abs)
       ? (this_frame_mvr_ratio * motion_pct)
       : this_frame->mvr_abs * motion_pct;
 
-    *mv_ratio_accumulator +=
-      (this_frame_mvc_ratio < this_frame->mvc_abs)
+    *mv_ratio_accumulator += (this_frame_mvc_ratio < this_frame->mvc_abs)
       ? (this_frame_mvc_ratio * motion_pct)
       : this_frame->mvc_abs * motion_pct;
   }
 }
 
 // Calculate a baseline boost number for the current frame.
-static double calc_frame_boost(
-  VP9_COMP *cpi,
-  FIRSTPASS_STATS *this_frame,
-  double this_frame_mv_in_out) {
+static double calc_frame_boost(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out) {
   double frame_boost;
 
-  // Underlying boost factor is based on inter intra error ratio
+  // Underlying boost factor is based on inter intra error ratio.
   if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
     frame_boost = (IIFACTOR * this_frame->intra_error /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
@@ -1390,28 +1181,23 @@ static double calc_frame_boost(
     frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
                    DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
 
-  // Increase boost for frames where new data coming into frame
-  // (eg zoom out). Slightly reduce boost if there is a net balance
-  // of motion out of the frame (zoom in).
-  // The range for this_frame_mv_in_out is -1.0 to +1.0
+  // Increase boost for frames where new data coming into frame (e.g. zoom out).
+  // Slightly reduce boost if there is a net balance of motion out of the frame
+  // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
   if (this_frame_mv_in_out > 0.0)
     frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-  // In extreme case boost is halved
+  // In the extreme case the boost is halved.
   else
     frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
-  // Clip to maximum
-  if (frame_boost > GF_RMAX)
-    frame_boost = GF_RMAX;
-
-  return frame_boost;
+  return MIN(frame_boost, GF_RMAX);
 }
 
 static int calc_arf_boost(VP9_COMP *cpi, int offset,
                           int f_frames, int b_frames,
                           int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
-
+  struct twopass_rc *const twopass = &cpi->twopass;
   int i;
   double boost_score = 0.0;
   double mv_ratio_accumulator = 0.0;
@@ -1422,12 +1208,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
   int arf_boost;
   int flash_detected = 0;
 
-  // Search forward from the proposed arf/next gf position
-  for (i = 0; i < f_frames; i++) {
-    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+  // Search forward from the proposed arf/next gf position.
+  for (i = 0; i < f_frames; ++i) {
+    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
@@ -1435,12 +1221,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
 
-    // Cumulative effect of prediction quality decay
+    // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1451,7 +1237,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
   *f_boost = (int)boost_score;
 
-  // Reset for backward looking loop
+  // Reset for backward looking loop.
   boost_score = 0.0;
   mv_ratio_accumulator = 0.0;
   decay_accumulator = 1.0;
@@ -1459,12 +1245,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
   mv_in_out_accumulator = 0.0;
   abs_mv_in_out_accumulator = 0.0;
 
-  // Search backward towards last gf position
-  for (i = -1; i >= -b_frames; i--) {
-    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+  // Search backward towards last gf position.
+  for (i = -1; i >= -b_frames; --i) {
+    if (read_frame_stats(twopass, &this_frame, (i + offset)) == EOF)
       break;
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
@@ -1472,12 +1258,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
 
     // We want to discount the the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
 
-    // Cumulative effect of prediction quality decay
+    // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(&cpi->common, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1525,8 +1311,7 @@ static void schedule_frames(VP9_COMP *cpi, const int start, const int end,
     return;
   }
 
-  // ARF Group: work out the ARF schedule.
-  // Mark ARF frames as negative.
+  // ARF Group: Work out the ARF schedule and mark ARF frames as negative.
   if (end < 0) {
     // printf("start:%d end:%d\n", -end, -end);
     // ARF frame is at the end of the range.
@@ -1578,6 +1363,8 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
   cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
   assert(cpi->this_frame_weight >= 0);
 
+  cpi->twopass.gf_zeromotion_pct = 0;
+
   // Initialize frame coding order variables.
   cpi->new_frame_coding_order_period = 0;
   cpi->next_frame_in_order = 0;
@@ -1586,16 +1373,16 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
   vp9_zero(cpi->arf_buffer_idx);
   vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
 
-  if (cpi->twopass.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
+  if (cpi->rc.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
     // Setup a GF group close to the keyframe.
-    cpi->source_alt_ref_pending = 0;
-    cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
-    schedule_frames(cpi, 0, (cpi->baseline_gf_interval - 1), 2, 0, 0);
+    cpi->rc.source_alt_ref_pending = 0;
+    cpi->rc.baseline_gf_interval = cpi->rc.frames_to_key;
+    schedule_frames(cpi, 0, (cpi->rc.baseline_gf_interval - 1), 2, 0, 0);
   } else {
     // Setup a fixed period ARF group.
-    cpi->source_alt_ref_pending = 1;
-    cpi->baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
-    schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
+    cpi->rc.source_alt_ref_pending = 1;
+    cpi->rc.baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
+    schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
   }
 
   // Replace level indicator of -1 with correct level.
@@ -1631,10 +1418,91 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
 }
 #endif
 
+// Calculate a section intra ratio used in setting max loop filter, averaged
+// over up to section_length frames of stats starting at start_pos.
+static void calculate_section_intra_ratio(struct twopass_rc *twopass,
+                                          const FIRSTPASS_STATS *start_pos,
+                                          int section_length) {
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS sectionstats;
+  int i;
+
+  vp9_zero(next_frame);
+  vp9_zero(sectionstats);
+  reset_fpf_position(twopass, start_pos);
+
+  for (i = 0; i < section_length; ++i) {
+    // Stop at the end of the stats, like every other input_stats() reader.
+    if (EOF == input_stats(twopass, &next_frame))
+      break;
+    accumulate_stats(&sectionstats, &next_frame);
+  }
+
+  avg_stats(&sectionstats);
+  twopass->section_intra_rating =
+    (int)(sectionstats.intra_error /
+          DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+  reset_fpf_position(twopass, start_pos);
+}
+
+// Return this GF/ARF group's error-proportional share of the KF group bits.
+static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const struct twopass_rc *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  int64_t total_group_bits;
+
+  // Scale kf_group_bits by gf_group_err / kf_group_error_left (else 0).
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    total_group_bits = (int64_t)(twopass->kf_group_bits *
+                                 (gf_group_err / twopass->kf_group_error_left));
+  } else {
+    total_group_bits = 0;
+  }
+
+  // Clamp odd edge cases: keep the result within [0, kf_group_bits].
+  total_group_bits = (total_group_bits < 0) ?
+     0 : (total_group_bits > twopass->kf_group_bits) ?
+     twopass->kf_group_bits : total_group_bits;
+
+  // Clip to max_bits per frame (user supplied data rate variability limit).
+  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+
+  return total_group_bits;
+}
+
+// Calculate the number of extra bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count,
+                                int boost, int64_t total_group_bits) {
+  int allocation_chunks;
+
+  // Return 0 for invalid inputs (could arise e.g. through rounding errors).
+  // A negative boost could zero the divisor or flip the result's sign.
+  if ((boost <= 0) || (total_group_bits <= 0) || (frame_count <= 0))
+    return 0;
+
+  allocation_chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow.
+  if (boost > 1023) {
+    const int divisor = boost >> 10;
+    boost /= divisor;
+    allocation_chunks /= divisor;
+  }
+
+  return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
+}
+
+
 // Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  FIRSTPASS_STATS next_frame = { 0 };
-  FIRSTPASS_STATS *start_pos;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  struct twopass_rc *const twopass = &cpi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *start_pos;
   int i;
   double boost_score = 0.0;
   double old_boost_score = 0.0;
@@ -1646,40 +1514,36 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
 
-  double loop_decay_rate = 1.00;          // Starting decay rate
+  double loop_decay_rate = 1.00;
   double last_loop_decay_rate = 1.00;
 
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   double mv_ratio_accumulator_thresh;
-  int max_bits = frame_max_bits(cpi);     // Max for a single frame
-
-  unsigned int allow_alt_ref =
-    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+  unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames;
 
   int f_boost = 0;
   int b_boost = 0;
   int flash_detected;
   int active_max_gf_interval;
 
-  cpi->twopass.gf_group_bits = 0;
-
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
+  vp9_zero(next_frame);
 
-  start_pos = cpi->twopass.stats_in;
+  twopass->gf_group_bits = 0;
+  start_pos = twopass->stats_in;
 
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(cpi, this_frame);
 
-  // Note the error of the frame at the start of the group (this will be
-  // the GF frame error if we code a normal gf
+  // Note the error of the frame at the start of the group. This will be
+  // the GF frame error if we code a normal gf.
   gf_first_frame_err = mod_frame_err;
 
-  // Special treatment if the current frame is a key frame (which is also
-  // a gf). If it is then its error score (and hence bit allocation) need
-  // to be subtracted out from the calculation for the GF group
-  if (cpi->common.frame_type == KEY_FRAME)
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
+  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     gf_group_err -= gf_first_frame_err;
 
   // Motion breakout threshold for loop below depends on image size.
@@ -1691,71 +1555,67 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // bits to spare and are better with a smaller interval and smaller boost.
   // At high Q when there are few bits to spare we are better with a longer
   // interval to spread the cost of the GF.
+  //
   active_max_gf_interval =
-    12 + ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 5);
+    12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5);
 
-  if (active_max_gf_interval > cpi->max_gf_interval)
-    active_max_gf_interval = cpi->max_gf_interval;
+  if (active_max_gf_interval > rc->max_gf_interval)
+    active_max_gf_interval = rc->max_gf_interval;
 
   i = 0;
-  while (((i < cpi->twopass.static_scene_max_gf_interval) ||
-          ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
-         (i < cpi->twopass.frames_to_key)) {
-    i++;    // Increment the loop counter
+  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+    ++i;
 
-    // Accumulate error score of frames in this gf group
+    // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, this_frame);
     gf_group_err += mod_frame_err;
 
-    if (EOF == input_stats(cpi, &next_frame))
+    if (EOF == input_stats(twopass, &next_frame))
       break;
 
     // Test for the case where there is a brief flash but the prediction
     // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(cpi, 0);
+    flash_detected = detect_flash(twopass, 0);
 
-    // Update the motion related elements to the boost calculation
+    // Update the motion related elements to the boost calculation.
     accumulate_frame_motion_stats(&next_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator,
                                   &mv_ratio_accumulator);
 
-    // Cumulative effect of prediction quality decay
+    // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
       if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
           zero_motion_accumulator) {
-        zero_motion_accumulator =
-          (next_frame.pcnt_inter - next_frame.pcnt_motion);
+        zero_motion_accumulator = next_frame.pcnt_inter -
+                                      next_frame.pcnt_motion;
       }
 
-      // Break clause to detect very still sections after motion
-      // (for example a static image after a fade or other transition).
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
+      if (detect_transition_to_still(twopass, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
         allow_alt_ref = 0;
         break;
       }
     }
 
-    // Calculate a boost number for this frame
-    boost_score +=
-      (decay_accumulator *
+    // Calculate a boost number for this frame.
+    boost_score += (decay_accumulator *
        calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
 
     // Break out conditions.
     if (
-      // Break at cpi->max_gf_interval unless almost totally static
+      // Break at active_max_gf_interval unless almost totally static.
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
-        // Don't break out with a very short interval
+        // Don't break out with a very short interval.
         (i > MIN_GF_INTERVAL) &&
-        // Don't break out very close to a key frame
-        ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
         ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
@@ -1771,26 +1631,23 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     old_boost_score = boost_score;
   }
 
-  cpi->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
 
-  // Don't allow a gf too near the next kf
-  if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < cpi->twopass.frames_to_key) {
-      i++;
+  // Don't allow a gf too near the next kf.
+  if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) {
+    while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) {
+      ++i;
 
-      if (EOF == input_stats(cpi, this_frame))
+      if (EOF == input_stats(twopass, this_frame))
         break;
 
-      if (i < cpi->twopass.frames_to_key) {
+      if (i < rc->frames_to_key) {
         mod_frame_err = calculate_modified_err(cpi, this_frame);
         gf_group_err += mod_frame_err;
       }
     }
   }
 
-  // Set the interval until the next gf or arf.
-  cpi->baseline_gf_interval = i;
-
 #if CONFIG_MULTIPLE_ARF
   if (cpi->multi_arf_enabled) {
     // Initialize frame coding order variables.
@@ -1803,36 +1660,39 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 #endif
 
-  // Should we use the alternate reference frame
+  // Set the interval until the next gf.
+  if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+    rc->baseline_gf_interval = i - 1;
+  else
+    rc->baseline_gf_interval = i;
+
+  // Should we use the alternate reference frame?
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
-      // dont use ARF very near next kf
-      (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
-      ((next_frame.pcnt_inter > 0.75) ||
-       (next_frame.pcnt_second_ref > 0.5)) &&
-      ((mv_in_out_accumulator / (double)i > -0.2) ||
-       (mv_in_out_accumulator > -2.0)) &&
-      (boost_score > 100)) {
-    // Alternative boost calculation for alt ref
-    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
-                                    &b_boost);
-    cpi->source_alt_ref_pending = 1;
+      // For real scene cuts (not forced kfs) don't allow arf very near kf.
+      (rc->next_key_frame_forced ||
+      (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) {
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+                                   &b_boost);
+    rc->source_alt_ref_pending = 1;
 
 #if CONFIG_MULTIPLE_ARF
     // Set the ARF schedule.
     if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
+      schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
     }
 #endif
   } else {
-    cpi->gfu_boost = (int)boost_score;
-    cpi->source_alt_ref_pending = 0;
+    rc->gfu_boost = (int)boost_score;
+    rc->source_alt_ref_pending = 0;
 #if CONFIG_MULTIPLE_ARF
     // Set the GF schedule.
     if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, cpi->baseline_gf_interval - 1, 2, 0, 0);
-      assert(cpi->new_frame_coding_order_period == cpi->baseline_gf_interval);
+      schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
+      assert(cpi->new_frame_coding_order_period ==
+             rc->baseline_gf_interval);
     }
 #endif
   }
@@ -1874,493 +1734,150 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 #endif
 #endif
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
 
-  // Now decide how many bits should be allocated to the GF group as  a
-  // proportion of those remaining in the kf group.
-  // The final key frame group in the clip is treated as a special case
-  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
-  // This is also important for short clips where there may only be one
-  // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
-                                          cpi->common.current_video_frame)) {
-    cpi->twopass.kf_group_bits =
-      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
-  }
-
-  // Calculate the bits to be allocated to the group as a whole
-  if ((cpi->twopass.kf_group_bits > 0) &&
-      (cpi->twopass.kf_group_error_left > 0)) {
-    cpi->twopass.gf_group_bits =
-      (int64_t)(cpi->twopass.kf_group_bits *
-                (gf_group_err / cpi->twopass.kf_group_error_left));
-  } else {
-    cpi->twopass.gf_group_bits = 0;
-  }
-  cpi->twopass.gf_group_bits =
-    (cpi->twopass.gf_group_bits < 0)
-    ? 0
-    : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-    ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
-
-  // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits >
-      (int64_t)max_bits * cpi->baseline_gf_interval)
-    cpi->twopass.gf_group_bits = (int64_t)max_bits * cpi->baseline_gf_interval;
-
-  // Reset the file position
-  reset_fpf_position(cpi, start_pos);
-
-  // Update the record of error used so far (only done once per gf group)
-  cpi->twopass.modified_error_used += gf_group_err;
-
-  // Assign  bits to the arf or gf.
-  for (i = 0;
-      i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
-      ++i) {
-    int allocation_chunks;
-    int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
-                                  : cpi->oxcf.fixed_q;
-    int gf_bits;
-
-    int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
-
-    // Set max and minimum boost and hence minimum allocation
-    boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
-
-    if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
-    else
-      allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  twopass->gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
-    // Prevent overflow
-    if (boost > 1023) {
-      int divisor = boost >> 10;
-      boost /= divisor;
-      allocation_chunks /= divisor;
-    }
+  // Calculate the extra bits to be used for boosted frame(s)
+  {
+    int q = rc->last_q[INTER_FRAME];
+    int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
 
-    // Calculate the number of bits to be spent on the gf or arf based on
-    // the boost number
-    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
-                                       (double)allocation_chunks));
-
-    // If the frame that is to be boosted is simpler than the average for
-    // the gf/arf group then use an alternative calculation
-    // based on the error score of the frame itself
-    if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double alt_gf_grp_bits =
-        (double)cpi->twopass.kf_group_bits  *
-        (mod_frame_err * (double)cpi->baseline_gf_interval) /
-        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
-
-      int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
-                                           (double)allocation_chunks));
-
-      if (gf_bits > alt_gf_bits)
-        gf_bits = alt_gf_bits;
-    } else {
-      // If it is harder than other frames in the group make sure it at
-      // least receives an allocation in keeping with its relative error
-      // score, otherwise it may be worse off than an "un-boosted" frame.
-      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
-                        mod_frame_err /
-                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
-
-      if (alt_gf_bits > gf_bits)
-        gf_bits = alt_gf_bits;
-    }
+    // Set max and minimum boost and hence minimum allocation.
+    boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
 
-    // Dont allow a negative value for gf_bits
-    if (gf_bits < 0)
-      gf_bits = 0;
+    // Calculate the extra bits to be used for boosted frame(s)
+    twopass->gf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                            boost, twopass->gf_group_bits);
 
-    // Add in minimum for a frame
-    gf_bits += cpi->min_frame_bandwidth;
 
-    if (i == 0) {
-      cpi->twopass.gf_bits = gf_bits;
-    }
-    if (i == 1 || (!cpi->source_alt_ref_pending
-        && (cpi->common.frame_type != KEY_FRAME))) {
-      // Per frame bit target for this frame
-      cpi->per_frame_bandwidth = gf_bits;
+    // For key frames the frame target rate is set already.
+    // NOTE: We don't bother to check for the special case of ARF overlay
+    // frames here, as there is clamping code for this in the function
+    // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
+    // encodes.
+    if (cpi->common.frame_type != KEY_FRAME &&
+        !vp9_is_upper_layer_key_frame(cpi)) {
+      vp9_rc_set_frame_target(cpi, twopass->gf_bits);
     }
   }
 
-  {
-    // Adjust KF group bits and error remaining
-    cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
-    cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
-
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
-
-    // Note the error score left in the remaining frames of the group.
-    // For normal GFs we want to remove the error score for the first frame
-    // of the group (except in Key frame case where this has already
-    // happened)
-    if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
-      cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err
-                                                   - gf_first_frame_err);
-    else
-      cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
-
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
-        - cpi->min_frame_bandwidth;
-
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
-
-    // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
-    // calculation of alt_extra_bits.
-    if (cpi->baseline_gf_interval >= 3) {
-      const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
-
-      if (boost >= 150) {
-        int alt_extra_bits;
-        int pct_extra = (boost - 100) / 50;
-        pct_extra = (pct_extra > 20) ? 20 : pct_extra;
-
-        alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
-        cpi->twopass.gf_group_bits -= alt_extra_bits;
-      }
-    }
+  // Adjust KF group bits and error remaining.
+  twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+  // If this is an arf update we want to remove the score for the overlay
+  // frame at the end which will usually be very cheap to code.
+  // The overlay frame has already, in effect, been coded so we want to spread
+  // the remaining bits among the other frames.
+  // For normal GFs remove the score for the GF itself unless this is
+  // also a key frame in which case it has already been accounted for.
+  if (rc->source_alt_ref_pending) {
+    twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err);
+  } else if (cpi->common.frame_type != KEY_FRAME) {
+    twopass->gf_group_error_left = (int64_t)(gf_group_err
+                                                 - gf_first_frame_err);
+  } else {
+    twopass->gf_group_error_left = (int64_t)gf_group_err;
   }
 
+  // Calculate a section intra ratio used in setting max loop filter.
   if (cpi->common.frame_type != KEY_FRAME) {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(cpi, start_pos);
-
-    for (i = 0; i < cpi->baseline_gf_interval; i++) {
-      input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    cpi->twopass.section_intra_rating = (int)
-      (sectionstats.intra_error /
-      DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
-
-    reset_fpf_position(cpi, start_pos);
+    calculate_section_intra_ratio(twopass, start_pos, rc->baseline_gf_interval);
   }
 }
 
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
 static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  struct twopass_rc *twopass = &cpi->twopass;
+  // For a single frame.
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  // Calculate modified prediction error used in bit allocation.
+  const double modified_err = calculate_modified_err(cpi, this_frame);
   int target_frame_size;
-
-  double modified_err;
   double err_fraction;
 
-  // Max for a single frame.
-  int max_bits = frame_max_bits(cpi);
-
-  // Calculate modified prediction error used in bit allocation.
-  modified_err = calculate_modified_err(cpi, this_frame);
-
-  if (cpi->twopass.gf_group_error_left > 0)
+  if (twopass->gf_group_error_left > 0)
     // What portion of the remaining GF group error is used by this frame.
-    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
+    err_fraction = modified_err / twopass->gf_group_error_left;
   else
     err_fraction = 0.0;
 
   // How many of those bits available for allocation should we give it?
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
+  target_frame_size = (int)((double)twopass->gf_group_bits * err_fraction);
 
   // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
   // the top end.
-  if (target_frame_size < 0) {
-    target_frame_size = 0;
-  } else {
-    if (target_frame_size > max_bits)
-      target_frame_size = max_bits;
-
-    if (target_frame_size > cpi->twopass.gf_group_bits)
-      target_frame_size = (int)cpi->twopass.gf_group_bits;
-  }
+  target_frame_size = clamp(target_frame_size, 0,
+                            MIN(max_bits, (int)twopass->gf_group_bits));
 
   // Adjust error and bits remaining.
-  cpi->twopass.gf_group_error_left -= (int64_t)modified_err;
-  cpi->twopass.gf_group_bits -= target_frame_size;
-
-  if (cpi->twopass.gf_group_bits < 0)
-    cpi->twopass.gf_group_bits = 0;
-
-  // Add in the minimum number of bits that is set aside for every frame.
-  target_frame_size += cpi->min_frame_bandwidth;
+  twopass->gf_group_error_left -= (int64_t)modified_err;
 
   // Per frame bit target for this frame.
-  cpi->per_frame_bandwidth = target_frame_size;
+  vp9_rc_set_frame_target(cpi, target_frame_size);
 }
 
-// Make a damped adjustment to the active max q.
-static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
-  int i;
-  const double old_q = vp9_convert_qindex_to_q(old_maxqi);
-  const double new_q = vp9_convert_qindex_to_q(new_maxqi);
-  const double target_q = ((old_q * 7.0) + new_q) / 8.0;
-
-  if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++)
-      if (vp9_convert_qindex_to_q(i) >= target_q)
-        return i;
-  } else {
-    for (i = old_maxqi; i >= new_maxqi; i--)
-      if (vp9_convert_qindex_to_q(i) <= target_q)
-        return i;
-  }
-
-  return new_maxqi;
-}
-
-void vp9_second_pass(VP9_COMP *cpi) {
-  int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats.count -
-                          cpi->common.current_video_frame);
-
-  FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS this_frame_copy;
-
-  double this_frame_intra_error;
-  double this_frame_coded_error;
-
-  if (!cpi->twopass.stats_in)
-    return;
-
-  vp9_clear_system_state();
-
-  if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-    cpi->active_worst_quality = cpi->oxcf.cq_level;
-  } else {
-    // Special case code for first frame.
-    if (cpi->common.current_video_frame == 0) {
-      int section_target_bandwidth =
-          (int)(cpi->twopass.bits_left / frames_left);
-      cpi->twopass.est_max_qcorrection_factor = 1.0;
-
-      // Set a cq_level in constrained quality mode.
-      // Commenting this code out for now since it does not seem to be
-      // working well.
-      /*
-      if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-        int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
-           section_target_bandwidth);
-
-        if (est_cq > cpi->cq_target_quality)
-          cpi->cq_target_quality = est_cq;
-        else
-          cpi->cq_target_quality = cpi->oxcf.cq_level;
-      }
-      */
-
-      // guess at maxq needed in 2nd pass
-      cpi->twopass.maxq_max_limit = cpi->worst_quality;
-      cpi->twopass.maxq_min_limit = cpi->best_quality;
-
-      tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
-                             section_target_bandwidth);
-
-      cpi->active_worst_quality = tmp_q;
-      cpi->ni_av_qi = tmp_q;
-      cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-
-#ifndef ONE_SHOT_Q_ESTIMATE
-      // Limit the maxq value returned subsequently.
-      // This increases the risk of overspend or underspend if the initial
-      // estimate for the clip is bad, but helps prevent excessive
-      // variation in Q, especially near the end of a clip
-      // where for example a small overspend may cause Q to crash
-      adjust_maxq_qrange(cpi);
-#endif
-    }
-
-#ifndef ONE_SHOT_Q_ESTIMATE
-    // The last few frames of a clip almost always have to few or too many
-    // bits and for the sake of over exact rate control we dont want to make
-    // radical adjustments to the allowed quantizer range just to use up a
-    // few surplus bits or get beneath the target rate.
-    else if ((cpi->common.current_video_frame <
-              (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
-             ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-              (unsigned int)cpi->twopass.total_stats.count)) {
-      int section_target_bandwidth =
-          (int)(cpi->twopass.bits_left / frames_left);
-      if (frames_left < 1)
-        frames_left = 1;
-
-      tmp_q = estimate_max_q(
-          cpi,
-          &cpi->twopass.total_left_stats,
-          section_target_bandwidth);
-
-      // Make a damped adjustment to active max Q
-      cpi->active_worst_quality =
-          adjust_active_maxq(cpi->active_worst_quality, tmp_q);
-    }
-#endif
-  }
-  vp9_zero(this_frame);
-  if (EOF == input_stats(cpi, &this_frame))
-    return;
-
-  this_frame_intra_error = this_frame.intra_error;
-  this_frame_coded_error = this_frame.coded_error;
-
-  // keyframe and section processing !
-  if (cpi->twopass.frames_to_key == 0) {
-    // Define next KF group and assign bits to it
-    this_frame_copy = this_frame;
-    find_next_key_frame(cpi, &this_frame_copy);
-  }
-
-  // Is this a GF / ARF (Note that a KF is always also a GF)
-  if (cpi->frames_till_gf_update_due == 0) {
-    // Define next gf group and assign bits to it
-    this_frame_copy = this_frame;
-
-    cpi->gf_zeromotion_pct = 0;
-
-#if CONFIG_MULTIPLE_ARF
-    if (cpi->multi_arf_enabled) {
-      define_fixed_arf_period(cpi);
-    } else {
-#endif
-      define_gf_group(cpi, &this_frame_copy);
-#if CONFIG_MULTIPLE_ARF
-    }
-#endif
-
-    if (cpi->gf_zeromotion_pct > 995) {
-      // As long as max_thresh for encode breakout is small enough, it is ok
-      // to enable it for no-show frame, i.e. set enable_encode_breakout to 2.
-      if (!cpi->common.show_frame)
-        cpi->enable_encode_breakout = 0;
-      else
-        cpi->enable_encode_breakout = 2;
-    }
-
-    // If we are going to code an altref frame at the end of the group
-    // and the current frame is not a key frame....
-    // If the previous group used an arf this frame has already benefited
-    // from that arf boost and it should not be given extra bits
-    // If the previous group was NOT coded using arf we may want to apply
-    // some boost to this GF as well
-    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
-      // Assign a standard frames worth of bits from those allocated
-      // to the GF group
-      int bak = cpi->per_frame_bandwidth;
-      this_frame_copy = this_frame;
-      assign_std_frame_bits(cpi, &this_frame_copy);
-      cpi->per_frame_bandwidth = bak;
-    }
-  } else {
-    // Otherwise this is an ordinary frame
-    // Assign bits from those allocated to the GF group
-    this_frame_copy =  this_frame;
-    assign_std_frame_bits(cpi, &this_frame_copy);
-  }
-
-  // Keep a globally available copy of this and the next frame's iiratio.
-  cpi->twopass.this_iiratio = (int)(this_frame_intra_error /
-                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
-  {
-    FIRSTPASS_STATS next_frame;
-    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
-      cpi->twopass.next_iiratio = (int)(next_frame.intra_error /
-                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-    }
-  }
-
-  // Set nominal per second bandwidth for this frame
-  cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth
-                                * cpi->output_framerate);
-  if (cpi->target_bandwidth < 0)
-    cpi->target_bandwidth = 0;
-
-  cpi->twopass.frames_to_key--;
-
-  // Update the total stats remaining structure
-  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
-}
-
-static int test_candidate_kf(VP9_COMP *cpi,
-                             FIRSTPASS_STATS *last_frame,
-                             FIRSTPASS_STATS *this_frame,
-                             FIRSTPASS_STATS *next_frame) {
+static int test_candidate_kf(struct twopass_rc *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame) {
   int is_viable_kf = 0;
 
-  // Does the frame satisfy the primary criteria of a key frame
-  //      If so, then examine how well it predicts subsequent frames
+  // Does the frame satisfy the primary criteria of a key frame?
+  // If so, then examine how well it predicts subsequent frames.
   if ((this_frame->pcnt_second_ref < 0.10) &&
       (next_frame->pcnt_second_ref < 0.10) &&
       ((this_frame->pcnt_inter < 0.05) ||
-       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) &&
         ((this_frame->intra_error /
           DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
         ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
-          .40) ||
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) ||
          (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
-          .40) ||
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) ||
          ((next_frame->intra_error /
            DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
     int i;
-    FIRSTPASS_STATS *start_pos;
-
-    FIRSTPASS_STATS local_next_frame;
-
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
     double decay_accumulator = 1.0;
-    double next_iiratio;
-
-    local_next_frame = *next_frame;
-
-    // Note the starting file position so we can reset to it
-    start_pos = cpi->twopass.stats_in;
 
-    // Examine how well the key frame predicts subsequent frames
-    for (i = 0; i < 16; i++) {
-      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
-                      DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < 16; ++i) {
+      double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
       if (next_iiratio > RMAX)
         next_iiratio = RMAX;
 
-      // Cumulative effect of decay in prediction quality
+      // Cumulative effect of decay in prediction quality.
       if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+        decay_accumulator *= local_next_frame.pcnt_inter;
       else
-        decay_accumulator =
-            decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
 
-      // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
-      // Keep a running total
+      // Keep a running total.
       boost_score += (decay_accumulator * next_iiratio);
 
-      // Test various breakout clauses
+      // Test various breakout clauses.
       if ((local_next_frame.pcnt_inter < 0.05) ||
           (next_iiratio < 1.5) ||
           (((local_next_frame.pcnt_inter -
              local_next_frame.pcnt_neutral) < 0.20) &&
            (next_iiratio < 3.0)) ||
           ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)
-         ) {
+          (local_next_frame.intra_error < 200)) {
         break;
       }
 
       old_boost_score = boost_score;
 
       // Get the next frame details
-      if (EOF == input_stats(cpi, &local_next_frame))
+      if (EOF == input_stats(twopass, &local_next_frame))
         break;
     }
 
@@ -2370,7 +1887,7 @@ static int test_candidate_kf(VP9_COMP *cpi,
       is_viable_kf = 1;
     } else {
       // Reset the file position
-      reset_fpf_position(cpi, start_pos);
+      reset_fpf_position(twopass, start_pos);
 
       is_viable_kf = 0;
     }
@@ -2378,343 +1895,445 @@ static int test_candidate_kf(VP9_COMP *cpi,
 
   return is_viable_kf;
 }
+
 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int i, j;
-  FIRSTPASS_STATS last_frame;
-  FIRSTPASS_STATS first_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  struct twopass_rc *const twopass = &cpi->twopass;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  const FIRSTPASS_STATS *start_position = twopass->stats_in;
   FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS *start_position;
-
+  FIRSTPASS_STATS last_frame;
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
-  double boost_score = 0;
-  double loop_decay_rate;
-
+  double boost_score = 0.0;
   double kf_mod_err = 0.0;
   double kf_group_err = 0.0;
-  double kf_group_intra_err = 0.0;
-  double kf_group_coded_err = 0.0;
   double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
 
   vp9_zero(next_frame);
 
-  vp9_clear_system_state();  // __asm emms;
-  start_position = cpi->twopass.stats_in;
-
   cpi->common.frame_type = KEY_FRAME;
 
-  // is this a forced key frame by interval
-  cpi->this_key_frame_forced = cpi->next_key_frame_forced;
-
-  // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->source_alt_ref_active = 0;
+  // Is this a forced key frame by interval?
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
 
-  // Kf is always a gf so clear frames till next gf counter
-  cpi->frames_till_gf_update_due = 0;
+  // Clear the alt ref active flag as this can never be active on a key frame.
+  rc->source_alt_ref_active = 0;
 
-  cpi->twopass.frames_to_key = 1;
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
 
-  // Take a copy of the initial frame details
-  first_frame = *this_frame;
+  rc->frames_to_key = 1;
 
-  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
-  cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
 
   kf_mod_err = calculate_modified_err(cpi, this_frame);
 
-  // find the next keyframe
+  // Find the next keyframe.
   i = 0;
-  while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
-    // Accumulate kf group error
+  while (twopass->stats_in < twopass->stats_in_end &&
+         rc->frames_to_key < cpi->oxcf.key_freq) {
+    // Accumulate kf group error.
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
-    // These figures keep intra and coded error counts for all frames including
-    // key frames in the group. The effect of the key frame itself can be
-    // subtracted out using the first_frame data collected above.
-    kf_group_intra_err += this_frame->intra_error;
-    kf_group_coded_err += this_frame->coded_error;
-
-    // load a the next frame's stats
+    // Load the next frame's stats.
     last_frame = *this_frame;
-    input_stats(cpi, this_frame);
+    input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key
-        && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
-      // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
-        break;
+    if (cpi->oxcf.auto_key &&
+        lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+      double loop_decay_rate;
 
+      // Check for a scene cut.
+      if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
+        break;
 
-      // How fast is prediction quality decaying
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      // How fast is the prediction quality decaying?
+      loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
 
       // We want to know something about the recent past... rather than
-      // as used elsewhere where we are concened with decay in prediction
+      // as used elsewhere where we are concerned with decay in prediction
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++)
+      for (j = 0; j < 8; ++j)
         decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
-      // to a static scene.
-      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+      // static scene.
+      if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i,
                                      loop_decay_rate, decay_accumulator))
         break;
 
-      // Step on to the next frame
-      cpi->twopass.frames_to_key++;
+      // Step on to the next frame.
+      ++rc->frames_to_key;
 
       // If we don't have a real key frame within the next two
-      // forcekeyframeevery intervals then break out of the loop.
-      if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+      // key_freq intervals then break out of the loop.
+      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq)
         break;
     } else {
-      cpi->twopass.frames_to_key++;
+      ++rc->frames_to_key;
     }
-    i++;
+    ++i;
   }
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
-  // This code centers the extra kf if the actual natural
-  // interval is between 1x and 2x
-  if (cpi->oxcf.auto_key
-      && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
-    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
-    FIRSTPASS_STATS tmp_frame;
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
+  if (cpi->oxcf.auto_key &&
+      rc->frames_to_key > cpi->oxcf.key_freq) {
+    FIRSTPASS_STATS tmp_frame = first_frame;
 
-    cpi->twopass.frames_to_key /= 2;
+    rc->frames_to_key /= 2;
 
-    // Copy first frame details
-    tmp_frame = first_frame;
-
-    // Reset to the start of the group
-    reset_fpf_position(cpi, start_position);
+    // Reset to the start of the group.
+    reset_fpf_position(twopass, start_position);
 
     kf_group_err = 0;
-    kf_group_intra_err = 0;
-    kf_group_coded_err = 0;
 
-    // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-      // Accumulate kf group errors
+    // Rescan to get the correct error data for the forced kf group.
+    for (i = 0; i < rc->frames_to_key; ++i) {
       kf_group_err += calculate_modified_err(cpi, &tmp_frame);
-      kf_group_intra_err += tmp_frame.intra_error;
-      kf_group_coded_err += tmp_frame.coded_error;
-
-      // Load a the next frame's stats
-      input_stats(cpi, &tmp_frame);
+      input_stats(twopass, &tmp_frame);
     }
-
-    // Reset to the start of the group
-    reset_fpf_position(cpi, current_pos);
-
-    cpi->next_key_frame_forced = 1;
+    rc->next_key_frame_forced = 1;
+  } else if (twopass->stats_in == twopass->stats_in_end ||
+             rc->frames_to_key >= cpi->oxcf.key_freq) {
+    rc->next_key_frame_forced = 1;
   } else {
-    cpi->next_key_frame_forced = 0;
+    rc->next_key_frame_forced = 0;
   }
-  // Special case for the last frame of the file
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
-    // Accumulate kf group error
-    kf_group_err += calculate_modified_err(cpi, this_frame);
 
-    // These figures keep intra and coded error counts for all frames including
-    // key frames in the group. The effect of the key frame itself can be
-    // subtracted out using the first_frame data collected above.
-    kf_group_intra_err += this_frame->intra_error;
-    kf_group_coded_err += this_frame->coded_error;
+  // Special case for the last key frame of the file.
+  if (twopass->stats_in >= twopass->stats_in_end) {
+    // Accumulate kf group error.
+    kf_group_err += calculate_modified_err(cpi, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
-  if ((cpi->twopass.bits_left > 0) &&
-      (cpi->twopass.modified_error_left > 0.0)) {
-    // Max for a single normal frame (not key frame)
-    int max_bits = frame_max_bits(cpi);
+  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+    // Maximum number of bits for a single normal frame (not key frame).
+    const int max_bits = frame_max_bits(rc, &cpi->oxcf);
 
-    // Maximum bits for the kf group
+    // Maximum number of bits allocated to the key frame group.
     int64_t max_grp_bits;
 
     // Default allocation based on bits left and relative
-    // complexity of the section
-    cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
-                                           (kf_group_err /
-                                            cpi->twopass.modified_error_left));
+    // complexity of the section.
+    twopass->kf_group_bits = (int64_t)(twopass->bits_left *
+       (kf_group_err / twopass->modified_error_left));
 
     // Clip based on maximum per frame rate defined by the user.
-    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
-    if (cpi->twopass.kf_group_bits > max_grp_bits)
-      cpi->twopass.kf_group_bits = max_grp_bits;
+    max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+    if (twopass->kf_group_bits > max_grp_bits)
+      twopass->kf_group_bits = max_grp_bits;
   } else {
-    cpi->twopass.kf_group_bits = 0;
+    twopass->kf_group_bits = 0;
   }
-  // Reset the first pass file position
-  reset_fpf_position(cpi, start_position);
+  twopass->kf_group_bits = MAX(0, twopass->kf_group_bits);
+
+  // Reset the first pass file position.
+  reset_fpf_position(twopass, start_position);
 
-  // Determine how big to make this keyframe based on how well the subsequent
-  // frames use inter blocks.
+  // Scan through the kf group collating various stats used to determine
+  // how many bits to spend on it.
   decay_accumulator = 1.0;
   boost_score = 0.0;
-  loop_decay_rate = 1.00;       // Starting decay rate
-
-  // Scan through the kf group collating various stats.
-  for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-    double r;
-
-    if (EOF == input_stats(cpi, &next_frame))
+  for (i = 0; i < rc->frames_to_key; ++i) {
+    if (EOF == input_stats(twopass, &next_frame))
       break;
 
     // Monitor for static sections.
     if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
-        zero_motion_accumulator) {
-      zero_motion_accumulator =
-        (next_frame.pcnt_inter - next_frame.pcnt_motion);
+            zero_motion_accumulator) {
+      zero_motion_accumulator = (next_frame.pcnt_inter -
+                                     next_frame.pcnt_motion);
     }
 
     // For the first few frames collect data to decide kf boost.
-    if (i <= (cpi->max_gf_interval * 2)) {
-      if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+    if (i <= (rc->max_gf_interval * 2)) {
+      double r;
+      if (next_frame.intra_error > twopass->kf_intra_err_min)
         r = (IIKFACTOR2 * next_frame.intra_error /
              DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
       else
-        r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+        r = (IIKFACTOR2 * twopass->kf_intra_err_min /
              DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
 
       if (r > RMAX)
         r = RMAX;
 
-      // How fast is prediction quality decaying
-      if (!detect_flash(cpi, 0)) {
-        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-        decay_accumulator = decay_accumulator * loop_decay_rate;
-        decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                              ? MIN_DECAY_FACTOR : decay_accumulator;
+      // How fast is prediction quality decaying.
+      if (!detect_flash(twopass, 0)) {
+        const double loop_decay_rate = get_prediction_decay_rate(&cpi->common,
+                                                                 &next_frame);
+        decay_accumulator *= loop_decay_rate;
+        decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
       }
 
       boost_score += (decay_accumulator * r);
     }
   }
 
-  {
-    FIRSTPASS_STATS sectionstats;
+  // Store the zero motion percentage
+  twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  calculate_section_intra_ratio(twopass, start_position, rc->frames_to_key);
+
+  // Work out how many bits to allocate for the key frame itself.
+  rc->kf_boost = (int)boost_score;
+
+  if (rc->kf_boost  < (rc->frames_to_key * 3))
+    rc->kf_boost  = (rc->frames_to_key * 3);
+  if (rc->kf_boost   < MIN_KF_BOOST)
+    rc->kf_boost = MIN_KF_BOOST;
+
+  twopass->kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
+                                          rc->kf_boost, twopass->kf_group_bits);
+
+  twopass->kf_group_bits -= twopass->kf_bits;
+
+  // Per frame bit target for this frame.
+  vp9_rc_set_frame_target(cpi, twopass->kf_bits);
+
+  // Note the total error score of the kf group minus the key frame itself.
+  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
+  twopass->modified_error_left -= kf_group_err;
+}
+
+void vp9_rc_get_first_pass_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 ||
+       (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+    cm->frame_type = KEY_FRAME;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
+// For VBR...adjustment to the frame target based on error from previous frames
+void vbr_rate_correction(int * this_frame_target,
+                         const int64_t vbr_bits_off_target) {
+  int max_delta = (*this_frame_target * 15) / 100;
+
+  // vbr_bits_off_target > 0 means we have extra bits to spend
+  if (vbr_bits_off_target > 0) {
+    *this_frame_target +=
+      (vbr_bits_off_target > max_delta) ? max_delta
+                                        : (int)vbr_bits_off_target;
+  } else {
+    *this_frame_target -=
+      (vbr_bits_off_target < -max_delta) ? max_delta
+                                         : (int)-vbr_bits_off_target;
+  }
+}
+
+void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  struct twopass_rc *const twopass = &cpi->twopass;
+  int frames_left;
+  FIRSTPASS_STATS this_frame;
+  FIRSTPASS_STATS this_frame_copy;
+
+  double this_frame_intra_error;
+  double this_frame_coded_error;
+  int target;
+  LAYER_CONTEXT *lc = NULL;
+  const int is_spatial_svc = (cpi->use_svc &&
+                              cpi->svc.number_temporal_layers == 1);
+  if (is_spatial_svc) {
+    lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+    frames_left = (int)(twopass->total_stats.count -
+                  lc->current_video_frame_in_layer);
+  } else {
+    frames_left = (int)(twopass->total_stats.count -
+                  cm->current_video_frame);
+  }
+
+  if (!twopass->stats_in)
+    return;
+
+  if (cpi->refresh_alt_ref_frame) {
+    int modified_target = twopass->gf_bits;
+    rc->base_frame_target = twopass->gf_bits;
+    cm->frame_type = INTER_FRAME;
+#ifdef LONG_TERM_VBR_CORRECTION
+    // Correction to rate target based on prior over or under shoot.
+    if (cpi->oxcf.rc_mode == RC_MODE_VBR)
+      vbr_rate_correction(&modified_target, rc->vbr_bits_off_target);
+#endif
+    vp9_rc_set_frame_target(cpi, modified_target);
+    return;
+  }
+
+  vp9_clear_system_state();
+
+  if (is_spatial_svc && twopass->kf_intra_err_min == 0) {
+    twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+    twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  }
+
+  if (cpi->oxcf.rc_mode == RC_MODE_CONSTANT_QUALITY) {
+    twopass->active_worst_quality = cpi->oxcf.cq_level;
+  } else if (cm->current_video_frame == 0 ||
+             (is_spatial_svc && lc->current_video_frame_in_layer == 0)) {
+    // Special case code for first frame.
+    const int section_target_bandwidth = (int)(twopass->bits_left /
+                                               frames_left);
+    const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats,
+                                                section_target_bandwidth);
+    twopass->active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->avg_q = vp9_convert_qindex_to_q(tmp_q);
+  }
+  vp9_zero(this_frame);
+  if (EOF == input_stats(twopass, &this_frame))
+    return;
 
-    zero_stats(&sectionstats);
-    reset_fpf_position(cpi, start_position);
+  this_frame_intra_error = this_frame.intra_error;
+  this_frame_coded_error = this_frame.coded_error;
 
-    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-      input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+    // Define next KF group and assign bits to it.
+    this_frame_copy = this_frame;
+    find_next_key_frame(cpi, &this_frame_copy);
+    // Don't place key frame in any enhancement layers in spatial svc
+    if (is_spatial_svc) {
+      lc->is_key_frame = 1;
+      if (cpi->svc.spatial_layer_id > 0) {
+        cm->frame_type = INTER_FRAME;
+      }
+    }
+  } else {
+    if (is_spatial_svc) {
+      lc->is_key_frame = 0;
     }
+    cm->frame_type = INTER_FRAME;
+  }
+
+  // Is this frame a GF / ARF? (Note: a key frame is always also a GF).
+  if (rc->frames_till_gf_update_due == 0) {
+    // Define next gf group and assign bits to it.
+    this_frame_copy = this_frame;
 
-    avg_stats(&sectionstats);
-
-    cpi->twopass.section_intra_rating = (int)
-      (sectionstats.intra_error
-      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
-  }
-
-  // Reset the first pass file position
-  reset_fpf_position(cpi, start_position);
-
-  // Work out how many bits to allocate for the key frame itself
-  if (1) {
-    int kf_boost = (int)boost_score;
-    int allocation_chunks;
-    int alt_kf_bits;
-
-    if (kf_boost < (cpi->twopass.frames_to_key * 3))
-      kf_boost = (cpi->twopass.frames_to_key * 3);
-
-    if (kf_boost < 300)  // Min KF boost
-      kf_boost = 300;
-
-    // Make a note of baseline boost and the zero motion
-    // accumulator value for use elsewhere.
-    cpi->kf_boost = kf_boost;
-    cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
-
-    // We do three calculations for kf size.
-    // The first is based on the error score for the whole kf group.
-    // The second (optionaly) on the key frames own error if this is
-    // smaller than the average for the group.
-    // The final one insures that the frame receives at least the
-    // allocation it would have received based on its own error score vs
-    // the error score remaining
-    // Special case if the sequence appears almost totaly static
-    // In this case we want to spend almost all of the bits on the
-    // key frame.
-    // cpi->twopass.frames_to_key-1 because key frame itself is taken
-    // care of by kf_boost.
-    if (zero_motion_accumulator >= 0.99) {
-      allocation_chunks =
-        ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+#if CONFIG_MULTIPLE_ARF
+    if (cpi->multi_arf_enabled) {
+      define_fixed_arf_period(cpi);
     } else {
-      allocation_chunks =
-        ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+#endif
+      define_gf_group(cpi, &this_frame_copy);
+#if CONFIG_MULTIPLE_ARF
     }
+#endif
 
-    // Prevent overflow
-    if (kf_boost > 1028) {
-      int divisor = kf_boost >> 10;
-      kf_boost /= divisor;
-      allocation_chunks /= divisor;
+    if (twopass->gf_zeromotion_pct > 995) {
+      // As long as max_thresh for encode breakout is small enough, it is ok
+      // to enable it for show frame, i.e. set allow_encode_breakout to
+      // ENCODE_BREAKOUT_LIMITED.
+      if (!cm->show_frame)
+        cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED;
+      else
+        cpi->allow_encode_breakout = ENCODE_BREAKOUT_LIMITED;
     }
 
-    cpi->twopass.kf_group_bits =
-        (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
-
-    // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits =
-        (int)((double)kf_boost *
-              ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
-
-    // If the key frame is actually easier than the average for the
-    // kf group (which does sometimes happen... eg a blank intro frame)
-    // Then use an alternate calculation based on the kf error score
-    // which should give a smaller key frame.
-    if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
-      double  alt_kf_grp_bits =
-        ((double)cpi->twopass.bits_left *
-         (kf_mod_err * (double)cpi->twopass.frames_to_key) /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
-
-      alt_kf_bits = (int)((double)kf_boost *
-                          (alt_kf_grp_bits / (double)allocation_chunks));
-
-      if (cpi->twopass.kf_bits > alt_kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
-    } else {
-    // Else if it is much harder than other frames in the group make sure
-    // it at least receives an allocation in keeping with its relative
-    // error score
-      alt_kf_bits =
-        (int)((double)cpi->twopass.bits_left *
-              (kf_mod_err /
-               DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
-
-      if (alt_kf_bits > cpi->twopass.kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    cpi->refresh_golden_frame = 1;
+  } else {
+    // Otherwise this is an ordinary frame.
+    // Assign bits from those allocated to the GF group.
+    this_frame_copy =  this_frame;
+    assign_std_frame_bits(cpi, &this_frame_copy);
+  }
+
+  // Keep a globally available copy of this and the next frame's iiratio.
+  twopass->this_iiratio = (int)(this_frame_intra_error /
+                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
+  {
+    FIRSTPASS_STATS next_frame;
+    if (lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+      twopass->next_iiratio = (int)(next_frame.intra_error /
+                                 DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
     }
+  }
 
-    cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
-    // Add in the minimum frame allowance
-    cpi->twopass.kf_bits += cpi->min_frame_bandwidth;
+  if (cpi->common.frame_type == KEY_FRAME)
+    target = vp9_rc_clamp_iframe_target_size(cpi, rc->this_frame_target);
+  else
+    target = vp9_rc_clamp_pframe_target_size(cpi, rc->this_frame_target);
 
-    // Peer frame bit target for this frame
-    cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
-    // Convert to a per second bitrate
-    cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
-                                  cpi->output_framerate);
-  }
+  rc->base_frame_target = target;
+#ifdef LONG_TERM_VBR_CORRECTION
+  // Correction to rate target based on prior over or under shoot.
+  if (cpi->oxcf.rc_mode == RC_MODE_VBR)
+    vbr_rate_correction(&target, rc->vbr_bits_off_target);
+#endif
+  vp9_rc_set_frame_target(cpi, target);
 
-  // Note the total error score of the kf group minus the key frame itself
-  cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+  // Update the total stats remaining structure.
+  subtract_stats(&twopass->total_left_stats, &this_frame);
+}
 
-  // Adjust the count of total modified error left.
-  // The count of bits left is adjusted elsewhere based on real coded frame
-  // sizes.
-  cpi->twopass.modified_error_left -= kf_group_err;
+void vp9_twopass_postencode_update(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+#ifdef LONG_TERM_VBR_CORRECTION
+  // In this experimental mode, the VBR correction is done exclusively through
+  // rc->vbr_bits_off_target. Based on the sign of this value, a limited %
+  // adjustment is made to the target rate of subsequent frames, to try and
+  // push it back towards 0. This mode is less likely to suffer from
+  // extreme behaviour at the end of a clip or group of frames.
+  const int bits_used = rc->base_frame_target;
+  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+#else
+  // In this mode, VBR correction is achieved by altering bits_left,
+  // kf_group_bits & gf_group_bits to reflect any deviation from the target
+  // rate in this frame. This alters the allocation of bits to the
+  // remaining frames in the group / clip.
+  //
+  // This method can give rise to unstable behaviour near the end of a clip
+  // or kf/gf group of frames where any accumulated error is corrected over an
+  // ever decreasing number of frames. Hence we change the balance of target
+  // vs. actual bitrate gradually as we progress towards the end of the
+  // sequence in order to mitigate this effect.
+  const double progress =
+      (double)(cpi->twopass.stats_in - cpi->twopass.stats_in_start) /
+              (cpi->twopass.stats_in_end - cpi->twopass.stats_in_start);
+  const int bits_used = (int)(progress * rc->this_frame_target +
+                             (1.0 - progress) * rc->projected_frame_size);
+#endif
+
+  cpi->twopass.bits_left -= bits_used;
+  cpi->twopass.bits_left = MAX(cpi->twopass.bits_left, 0);
+
+#ifdef LONG_TERM_VBR_CORRECTION
+  if (cpi->common.frame_type != KEY_FRAME &&
+      !vp9_is_upper_layer_key_frame(cpi)) {
+#else
+  if (cpi->common.frame_type == KEY_FRAME ||
+      vp9_is_upper_layer_key_frame(cpi)) {
+    // For key frames kf_group_bits already had the target bits subtracted out.
+    // So now update to the correct value based on the actual bits used.
+    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - bits_used;
+  } else {
+#endif
+    cpi->twopass.kf_group_bits -= bits_used;
+    cpi->twopass.gf_group_bits -= bits_used;
+    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
+  }
+  cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
index c18d11e0431..f7ba423b91f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -10,14 +10,92 @@
 
 #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
-#include "vp9/encoder/vp9_onyx_int.h"
 
-void vp9_init_first_pass(VP9_COMP *cpi);
-void vp9_first_pass(VP9_COMP *cpi);
-void vp9_end_first_pass(VP9_COMP *cpi);
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-void vp9_init_second_pass(VP9_COMP *cpi);
-void vp9_second_pass(VP9_COMP *cpi);
-void vp9_end_second_pass(VP9_COMP *cpi);
+typedef struct {
+  double frame;
+  double intra_error;
+  double coded_error;
+  double sr_coded_error;
+  double ssim_weighted_pred_err;
+  double pcnt_inter;
+  double pcnt_motion;
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double new_mv_count;
+  double duration;
+  double count;
+  int64_t spatial_layer_id;
+} FIRSTPASS_STATS;
+
+struct twopass_rc {
+  unsigned int section_intra_rating;
+  unsigned int next_iiratio;
+  unsigned int this_iiratio;
+  FIRSTPASS_STATS total_stats;
+  FIRSTPASS_STATS this_frame_stats;
+  const FIRSTPASS_STATS *stats_in;
+  const FIRSTPASS_STATS *stats_in_start;
+  const FIRSTPASS_STATS *stats_in_end;
+  FIRSTPASS_STATS total_left_stats;
+  int first_pass_done;
+  int64_t bits_left;
+  int64_t clip_bits_total;
+  double avg_iiratio;
+  double modified_error_min;
+  double modified_error_max;
+  double modified_error_total;
+  double modified_error_left;
+  double kf_intra_err_min;
+  double gf_intra_err_min;
+  int kf_bits;
+  // Remaining error from uncoded frames in a gf group. Two pass use only
+  int64_t gf_group_error_left;
+
+  // Projected total bits available for a key frame group of frames
+  int64_t kf_group_bits;
+
+  // Error score of frames still to be coded in kf group
+  int64_t kf_group_error_left;
+
+  // Projected Bits available for a group of frames including 1 GF or ARF
+  int64_t gf_group_bits;
+  // Bits for the golden frame or ARF - 2 pass only
+  int gf_bits;
+  int alt_extra_bits;
+
+  int sr_update_lag;
+
+  int kf_zeromotion_pct;
+  int gf_zeromotion_pct;
+
+  int active_worst_quality;
+};
+
+struct VP9_COMP;
+
+void vp9_init_first_pass(struct VP9_COMP *cpi);
+void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi);
+void vp9_end_first_pass(struct VP9_COMP *cpi);
+
+void vp9_init_second_pass(struct VP9_COMP *cpi);
+void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
index c28c868457a..abe71e681d3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
@@ -11,9 +11,15 @@
 #include <stdlib.h>
 
 #include "./vpx_config.h"
+
 #include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_lookahead.h"
-#include "vp9/common/vp9_extend.h"
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
   unsigned int max_sz;         /* Absolute size of the queue */
@@ -25,8 +31,8 @@ struct lookahead_ctx {
 
 
 /* Return the buffer at the given absolute index and increment the index */
-static struct lookahead_entry * pop(struct lookahead_ctx *ctx,
-                                    unsigned int *idx) {
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
+                                   unsigned int *idx) {
   unsigned int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
@@ -52,16 +58,19 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
 }
 
 
-struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
-                                          unsigned int height,
-                                          unsigned int subsampling_x,
-                                          unsigned int subsampling_y,
-                                          unsigned int depth) {
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
+                                         unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
   // Clamp the lookahead queue depth
   depth = clamp(depth, 1, MAX_LAG_BUFFERS);
 
+  // Allocate memory to keep previous source frames available.
+  depth += MAX_PRE_FRAMES;
+
   // Allocate the lookahead structures
   ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
@@ -73,7 +82,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
     for (i = 0; i < depth; i++)
       if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
                                  width, height, subsampling_x, subsampling_y,
-                                 VP9BORDERINPIXELS))
+                                 VP9_ENC_BORDER_IN_PIXELS))
         goto bail;
   }
   return ctx;
@@ -85,8 +94,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
 #define USE_PARTIAL_COPY 0
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags,
-                       unsigned char *active_map) {
+                       int64_t ts_start, int64_t ts_end, unsigned int flags) {
   struct lookahead_entry *buf;
 #if USE_PARTIAL_COPY
   int row, col, active_end;
@@ -94,7 +102,7 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
   int mb_cols = (src->y_width + 15) >> 4;
 #endif
 
-  if (ctx->sz + 1 > ctx->max_sz)
+  if (ctx->sz + 1  + MAX_PRE_FRAMES > ctx->max_sz)
     return 1;
   ctx->sz++;
   buf = pop(ctx, &ctx->write_idx);
@@ -157,11 +165,11 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
 }
 
 
-struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx,
-                                           int drain) {
+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain) {
   struct lookahead_entry *buf = NULL;
 
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
+  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
     buf = pop(ctx, &ctx->read_idx);
     ctx->sz--;
   }
@@ -169,17 +177,28 @@ struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx,
 }
 
 
-struct lookahead_entry * vp9_lookahead_peek(struct lookahead_ctx *ctx,
-                                            int index) {
+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index) {
   struct lookahead_entry *buf = NULL;
 
-  assert(index < (int)ctx->max_sz);
-  if (index < (int)ctx->sz) {
-    index += ctx->read_idx;
-    if (index >= (int)ctx->max_sz)
-      index -= ctx->max_sz;
-    buf = ctx->buf + index;
+  if (index >= 0) {
+    // Forward peek
+    if (index < (int)ctx->sz) {
+      index += ctx->read_idx;
+      if (index >= (int)ctx->max_sz)
+        index -= ctx->max_sz;
+      buf = ctx->buf + index;
+    }
+  } else if (index < 0) {
+    // Backward peek
+    if (-index <= MAX_PRE_FRAMES) {
+      index += ctx->read_idx;
+      if (index < 0)
+        index += ctx->max_sz;
+      buf = ctx->buf + index;
+    }
   }
+
   return buf;
 }
 
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
index c773f8fcc6d..ff63c0d0d75 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
@@ -14,6 +14,10 @@
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MAX_LAG_BUFFERS 25
 
 struct lookahead_entry {
@@ -59,8 +63,7 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
  * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags,
-                       unsigned char *active_map);
+                       int64_t ts_start, int64_t ts_end, unsigned int flags);
 
 
 /**\brief Get the next source buffer to encode
@@ -94,4 +97,8 @@ struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
  */
 unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_LOOKAHEAD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
index 7b605b212f8..5e87d283324 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -11,7 +11,6 @@
 #include <limits.h>
 
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -21,57 +20,50 @@
 #include "vp9/common/vp9_systemdependent.h"
 
 
-
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
-                                              int_mv *ref_mv,
-                                              int_mv *dst_mv,
+                                              const MV *ref_mv,
+                                              MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-  unsigned int best_err;
 
   const int tmp_col_min = x->mv_col_min;
   const int tmp_col_max = x->mv_col_max;
   const int tmp_row_min = x->mv_row_min;
   const int tmp_row_max = x->mv_row_max;
-  int_mv ref_full;
+  MV ref_full;
 
   // Further step/diamond searches as necessary
   int step_param = cpi->sf.reduce_first_step_size +
-      (cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
-  step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
+                       (cpi->oxcf.speed > 5 ? 1 : 0);
+  step_param = MIN(step_param, cpi->sf.max_step_search_steps - 2);
 
-  vp9_clamp_mv_min_max(x, &ref_mv->as_mv);
+  vp9_set_mv_search_range(x, ref_mv);
 
-  ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
-  ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
+  ref_full.col = ref_mv->col >> 3;
+  ref_full.row = ref_mv->row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit,
-                            0, &v_fn_ptr,
-                            0, &ref_mv->as_mv, &dst_mv->as_mv);
+  vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
+                 ref_mv, dst_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
     int distortion;
     unsigned int sse;
-    best_err = cpi->find_fractional_mv_step(
-        x,
-        &dst_mv->as_mv, &ref_mv->as_mv,
-        cpi->common.allow_high_precision_mv,
-        x->errorperbit, &v_fn_ptr,
-        0, cpi->sf.subpel_iters_per_step, NULL, NULL,
-        & distortion, &sse);
+    cpi->find_fractional_mv_step(
+        x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion,
+        &sse);
   }
 
-  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
+  xd->mi[0]->mbmi.mode = NEWMV;
+  xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
+
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
-  best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                          INT_MAX);
 
   /* restore UMV window */
   x->mv_col_min = tmp_col_min;
@@ -79,15 +71,17 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   x->mv_row_min = tmp_row_min;
   x->mv_row_max = tmp_row_max;
 
-  return best_err;
+  return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+          INT_MAX);
 }
 
-static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv,
-                                  int mb_row, int mb_col) {
+static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
+                                  int_mv *dst_mv, int mb_row, int mb_col) {
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
-  int_mv tmp_mv;
+  MV tmp_mv;
 
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
@@ -101,20 +95,19 @@ static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv,
   tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
   if (tmp_err < err) {
     err = tmp_err;
-    dst_mv->as_int = tmp_mv.as_int;
+    dst_mv->as_mv = tmp_mv;
   }
 
   // If the current best reference mv is not centered on 0,0 then do a 0,0
   // based search as well.
-  if (ref_mv->as_int) {
+  if (ref_mv->row != 0 || ref_mv->col != 0) {
     unsigned int tmp_err;
-    int_mv zero_ref_mv, tmp_mv;
+    MV zero_ref_mv = {0, 0}, tmp_mv;
 
-    zero_ref_mv.as_int = 0;
     tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,
                                         mb_row, mb_col);
     if (tmp_err < err) {
-      dst_mv->as_int = tmp_mv.as_int;
+      dst_mv->as_mv = tmp_mv;
       err = tmp_err;
     }
   }
@@ -137,12 +130,10 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
 
   return err;
 }
-static int find_best_16x16_intra(VP9_COMP *cpi,
-                                 int mb_y_offset,
-                                 MB_PREDICTION_MODE *pbest_mode) {
+static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_PREDICTION_MODE best_mode = -1, mode;
+  PREDICTION_MODE best_mode = -1, mode;
   unsigned int best_err = INT_MAX;
 
   // calculate SATD for each intra prediction mode;
@@ -150,10 +141,11 @@ static int find_best_16x16_intra(VP9_COMP *cpi,
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     unsigned int err;
 
-    xd->mi_8x8[0]->mbmi.mode = mode;
+    xd->mi[0]->mbmi.mode = mode;
     vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
                             x->plane[0].src.buf, x->plane[0].src.stride,
-                            xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+                            xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                            0, 0, 0);
     err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                        xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
 
@@ -177,11 +169,8 @@ static void update_mbgraph_mb_stats
   YV12_BUFFER_CONFIG *buf,
   int mb_y_offset,
   YV12_BUFFER_CONFIG *golden_ref,
-  int_mv *prev_golden_ref_mv,
-  int gld_y_offset,
+  const MV *prev_golden_ref_mv,
   YV12_BUFFER_CONFIG *alt_ref,
-  int_mv *prev_alt_ref_mv,
-  int arf_y_offset,
   int mb_row,
   int mb_col
 ) {
@@ -198,7 +187,7 @@ static void update_mbgraph_mb_stats
   xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
 
   // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, mb_y_offset,
+  intra_error = find_best_16x16_intra(cpi,
                                       &stats->ref[INTRA_FRAME].m.mode);
   if (intra_error <= 0)
     intra_error = 1;
@@ -246,34 +235,31 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
 
   int mb_col, mb_row, offset = 0;
   int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
-  int_mv arf_top_mv, gld_top_mv;
-  MODE_INFO mi_local = { { 0 } };
+  MV arf_top_mv = {0, 0}, gld_top_mv = {0, 0};
+  MODE_INFO mi_local;
 
+  vp9_zero(mi_local);
   // Set up limit values for motion vectors to prevent them extending outside
   // the UMV borders.
-  arf_top_mv.as_int = 0;
-  gld_top_mv.as_int = 0;
   x->mv_row_min     = -BORDER_MV_PIXELS_B16;
   x->mv_row_max     = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
   xd->up_available  = 0;
   xd->plane[0].dst.stride  = buf->y_stride;
   xd->plane[0].pre[0].stride  = buf->y_stride;
   xd->plane[1].dst.stride = buf->uv_stride;
-  xd->mi_8x8[0] = &mi_local;
+  xd->mi[0] = &mi_local;
   mi_local.mbmi.sb_type = BLOCK_16X16;
   mi_local.mbmi.ref_frame[0] = LAST_FRAME;
   mi_local.mbmi.ref_frame[1] = NONE;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    int_mv arf_left_mv, gld_left_mv;
+    MV arf_left_mv = arf_top_mv, gld_left_mv = gld_top_mv;
     int mb_y_in_offset  = mb_y_offset;
     int arf_y_in_offset = arf_y_offset;
     int gld_y_in_offset = gld_y_offset;
 
     // Set up limit values for motion vectors to prevent them extending outside
     // the UMV borders.
-    arf_left_mv.as_int = arf_top_mv.as_int;
-    gld_left_mv.as_int = gld_top_mv.as_int;
     x->mv_col_min      = -BORDER_MV_PIXELS_B16;
     x->mv_col_max      = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
     xd->left_available = 0;
@@ -282,14 +268,13 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
       MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
 
       update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
-                              golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset,
+                              golden_ref, &gld_left_mv, alt_ref,
                               mb_row, mb_col);
-      arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
-      gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
+      arf_left_mv = mb_stats->ref[ALTREF_FRAME].m.mv.as_mv;
+      gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
       if (mb_col == 0) {
-        arf_top_mv.as_int = arf_left_mv.as_int;
-        gld_top_mv.as_int = gld_left_mv.as_int;
+        arf_top_mv = arf_left_mv;
+        gld_top_mv = gld_left_mv;
       }
       xd->left_available = 1;
       mb_y_in_offset    += 16;
@@ -324,8 +309,8 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
                              1));
 
   // We are not interested in results beyond the alt ref itself.
-  if (n_frames > cpi->frames_till_gf_update_due)
-    n_frames = cpi->frames_till_gf_update_due;
+  if (n_frames > cpi->rc.frames_till_gf_update_due)
+    n_frames = cpi->rc.frames_till_gf_update_due;
 
   // defer cost to reference frames
   for (i = n_frames - 1; i >= 0; i--) {
@@ -356,7 +341,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
       // If any of the blocks in the sequence failed then the MB
       // goes in segment 0
-      if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) {
+      if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
         ncnt[0]++;
         cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
       } else {
@@ -378,11 +363,10 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
     else
       cpi->static_mb_pct = 0;
 
-    cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(&cm->seg);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(&cm->seg);
   }
 
   // Free localy allocated storage
@@ -392,15 +376,13 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
 void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
-  YV12_BUFFER_CONFIG *golden_ref =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
+  YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
 
   // we need to look ahead beyond where the ARF transitions into
   // being a GF - so exit if we don't look ahead beyond that
-  if (n_frames <= cpi->frames_till_gf_update_due)
+  if (n_frames <= cpi->rc.frames_till_gf_update_due)
     return;
-  if (n_frames > (int)cpi->frames_till_alt_ref_frame)
-    n_frames = cpi->frames_till_alt_ref_frame;
+
   if (n_frames > MAX_LAG_BUFFERS)
     n_frames = MAX_LAG_BUFFERS;
 
@@ -426,7 +408,7 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
                                golden_ref, cpi->Source);
   }
 
-  vp9_clear_system_state();  // __asm emms;
+  vp9_clear_system_state();
 
   separate_arf_mbs(cpi);
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.h
index c5bca4d01f5..c3af972bc00 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.h
@@ -11,6 +11,30 @@
 #ifndef VP9_ENCODER_VP9_MBGRAPH_H_
 #define VP9_ENCODER_VP9_MBGRAPH_H_
 
-void vp9_update_mbgraph_stats(VP9_COMP *cpi);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  struct {
+    int err;
+    union {
+      int_mv mv;
+      PREDICTION_MODE mode;
+    } m;
+  } ref[MAX_REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+struct VP9_COMP;
+
+void vp9_update_mbgraph_stats(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_MBGRAPH_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
index a52f5b1b0af..4f7d6f17cd8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -16,19 +16,28 @@
 
 #include "vpx_mem/vpx_mem.h"
 
-#include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_common.h"
 
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_mcomp.h"
 
 // #define NEW_DIAMOND_SEARCH
 
-void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) {
-  const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
-  const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
-  const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
-  const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
+void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
+  int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+  int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+  int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+  int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+  col_min = MAX(col_min, (MV_LOW >> 3) + 1);
+  row_min = MAX(row_min, (MV_LOW >> 3) + 1);
+  col_max = MIN(col_max, (MV_UPP >> 3) - 1);
+  row_max = MIN(row_max, (MV_UPP >> 3) - 1);
 
   // Get intersection of UMV window and valid MV window to reduce # of checks
   // in diamond search.
@@ -42,7 +51,7 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) {
     x->mv_row_max = row_max;
 }
 
-int vp9_init_search_range(VP9_COMP *cpi, int size) {
+int vp9_init_search_range(const SPEED_FEATURES *sf, int size) {
   int sr = 0;
 
   // Minimum search size no matter what the passed in value.
@@ -51,16 +60,13 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) {
   while ((size << sr) < MAX_FULL_PEL_VAL)
     sr++;
 
-  if (sr)
-    sr--;
-
-  sr += cpi->sf.reduce_first_step_size;
-  sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));
+  sr += sf->reduce_first_step_size;
+  sr = MIN(sr, (sf->max_step_search_steps - 2));
   return sr;
 }
 
 static INLINE int mv_cost(const MV *mv,
-                          const int *joint_cost, int *comp_cost[2]) {
+                          const int *joint_cost, int *const comp_cost[2]) {
   return joint_cost[vp9_get_mv_joint(mv)] +
              comp_cost[0][mv->row] + comp_cost[1][mv->col];
 }
@@ -84,63 +90,43 @@ static int mv_err_cost(const MV *mv, const MV *ref,
   return 0;
 }
 
-static int mvsad_err_cost(const MV *mv, const MV *ref,
-                          const int *mvjsadcost, int *mvsadcost[2],
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
                           int error_per_bit) {
-  if (mvsadcost) {
+  if (x->nmvsadcost) {
     const MV diff = { mv->row - ref->row,
                       mv->col - ref->col };
-    return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) *
-                                  error_per_bit, 8);
+    return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
+                                      x->nmvsadcost) * error_per_bit, 8);
   }
   return 0;
 }
 
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
-  int len;
-  int search_site_count = 0;
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+  int len, ss_count = 1;
 
-  // Generate offsets for 4 search sites per step.
-  x->ss[search_site_count].mv.col = 0;
-  x->ss[search_site_count].mv.row = 0;
-  x->ss[search_site_count].offset = 0;
-  search_site_count++;
+  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+  cfg->ss[0].offset = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -len;
-    x->ss[search_site_count].offset = -len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = len;
-    x->ss[search_site_count].offset = len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = len;
-    search_site_count++;
+    // Generate offsets for 4 search sites per step.
+    const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}};
+    int i;
+    for (i = 0; i < 4; ++i) {
+      search_site *const ss = &cfg->ss[ss_count++];
+      ss->mv = ss_mvs[i];
+      ss->offset = ss->mv.row * stride + ss->mv.col;
+    }
   }
 
-  x->ss_count = search_site_count;
-  x->searches_per_step = 4;
+  cfg->ss_count = ss_count;
+  cfg->searches_per_step = 4;
 }
 
-void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
+void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
   int len, ss_count = 1;
 
-  x->ss[0].mv.col = x->ss[0].mv.row = 0;
-  x->ss[0].offset = 0;
+  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+  cfg->ss[0].offset = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 8 search sites per step.
@@ -150,14 +136,14 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
     };
     int i;
     for (i = 0; i < 8; ++i) {
-      search_site *const ss = &x->ss[ss_count++];
+      search_site *const ss = &cfg->ss[ss_count++];
       ss->mv = ss_mvs[i];
       ss->offset = ss->mv.row * stride + ss->mv.col;
     }
   }
 
-  x->ss_count = ss_count;
-  x->searches_per_step = 8;
+  cfg->ss_count = ss_count;
+  cfg->searches_per_step = 8;
 }
 
 /*
@@ -178,35 +164,34 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
       error_per_bit + 4096) >> 13 : 0)
 
 
-#define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset
-                                // for svf calc
-
-#define IFMVCV(r, c, s, e)                                \
-    if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
-      s                                                   \
-    else                                                  \
-      e;
+// convert motion vector component to offset for svf calc
+static INLINE int sp(int x) {
+  return (x & 7) << 1;
+}
 
-/* pointer to predictor base of a motionvector */
-#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+  return &buf[(r >> 3) * stride + (c >> 3)];
+}
 
 /* returns subpixel variance error function */
 #define DIST(r, c) \
-    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, src_stride, &sse)
+    vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+             src_stride, &sse)
 
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
-    IFMVCV(r, c, {                                                       \
-      thismse = (DIST(r, c));                                            \
-      if ((v = MVC(r, c) + thismse) < besterr) {                         \
-        besterr = v;                                                     \
-        br = r;                                                          \
-        bc = c;                                                          \
-        *distortion = thismse;                                           \
-        *sse1 = sse;                                                     \
-      }                                                                  \
-    },                                                                   \
-    v = INT_MAX;)
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = (DIST(r, c));                                            \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
 
 #define FIRST_LEVEL_CHECKS                              \
   {                                                     \
@@ -273,105 +258,7 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
     }                                                   \
   }
 
-int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
-                                      MV *bestmv, const MV *ref_mv,
-                                      int allow_hp,
-                                      int error_per_bit,
-                                      const vp9_variance_fn_ptr_t *vfp,
-                                      int forced_stop,
-                                      int iters_per_step,
-                                      int *mvjcost, int *mvcost[2],
-                                      int *distortion,
-                                      unsigned int *sse1) {
-  uint8_t *z = x->plane[0].src.buf;
-  int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
-  int thismse;
-
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  uint8_t *y = xd->plane[0].pre[0].buf + offset;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row <<= 3;
-  bestmv->col <<= 3;
-
-  // calculate central point error
-  besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
-  // TODO(jbb): Each subsequent iteration checks at least one point in
-  // common with the last iteration could be 2 if diagonal is selected.
-  while (halfiters--) {
-    // 1/2 pel
-    FIRST_LEVEL_CHECKS;
-    // no reason to check the same one again.
-    if (tr == br && tc == bc)
-      break;
-    tr = br;
-    tc = bc;
-  }
-
-  // TODO(yaowu): Each subsequent iteration checks at least one point in common
-  // with the last iteration could be 2 if diagonal is selected.
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    while (quarteriters--) {
-      FIRST_LEVEL_CHECKS;
-      // no reason to check the same one again.
-      if (tr == br && tc == bc)
-        break;
-      tr = br;
-      tc = bc;
-    }
-  }
-
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
-    hstep >>= 1;
-    while (eighthiters--) {
-      FIRST_LEVEL_CHECKS;
-      // no reason to check the same one again.
-      if (tr == br && tc == bc)
-        break;
-      tr = br;
-      tc = bc;
-    }
-  }
-
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
-
-  return besterr;
-}
-
-int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
+int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
                                  MV *bestmv, const MV *ref_mv,
                                  int allow_hp,
                                  int error_per_bit,
@@ -381,9 +268,9 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
                                  int *mvjcost, int *mvcost[2],
                                  int *distortion,
                                  unsigned int *sse1) {
-  uint8_t *z = x->plane[0].src.buf;
+  const uint8_t *const z = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *xd = &x->e_mbd;
+  const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
   unsigned int whichdir;
@@ -394,7 +281,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
 
   const int y_stride = xd->plane[0].pre[0].stride;
   const int offset = bestmv->row * y_stride + bestmv->col;
-  uint8_t *y = xd->plane[0].pre[0].buf + offset;
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
 
   int rr = ref_mv->row;
   int rc = ref_mv->col;
@@ -414,7 +301,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
   bestmv->col *= 8;
 
   // calculate central point error
-  besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
+  besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -446,6 +333,10 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
     tr = br;
     tc = bc;
   }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
 
   bestmv->row = br;
   bestmv->col = bc;
@@ -460,113 +351,10 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
 #undef DIST
 /* returns subpixel variance error function */
 #define DIST(r, c) \
-    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+    vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
               z, src_stride, &sse, second_pred)
 
-int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
-                                           MV *bestmv, const MV *ref_mv,
-                                           int allow_hp,
-                                           int error_per_bit,
-                                           const vp9_variance_fn_ptr_t *vfp,
-                                           int forced_stop,
-                                           int iters_per_step,
-                                           int *mvjcost, int *mvcost[2],
-                                           int *distortion,
-                                           unsigned int *sse1,
-                                           const uint8_t *second_pred,
-                                           int w, int h) {
-  uint8_t *const z = x->plane[0].src.buf;
-  const int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  unsigned int besterr = INT_MAX;
-  unsigned int sse;
-  unsigned int whichdir;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
-  int thismse;
-
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-  const int y_stride = xd->plane[0].pre[0].stride;
-  const int offset = bestmv->row * y_stride + bestmv->col;
-  uint8_t *const y = xd->plane[0].pre[0].buf + offset;
-
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
-  int br = bestmv->row * 8;
-  int bc = bestmv->col * 8;
-  int hstep = 4;
-  const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
-  int tr = br;
-  int tc = bc;
-
-  // central mv
-  bestmv->row *= 8;
-  bestmv->col *= 8;
-
-  // calculate central point error
-  // TODO(yunqingwang): central pointer error was already calculated in full-
-  // pixel search, and can be passed in this function.
-  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
-  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
-  // Each subsequent iteration checks at least one point in
-  // common with the last iteration could be 2 ( if diag selected)
-  while (halfiters--) {
-    // 1/2 pel
-    FIRST_LEVEL_CHECKS;
-    // no reason to check the same one again.
-    if (tr == br && tc == bc)
-      break;
-    tr = br;
-    tc = bc;
-  }
-
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
-
-  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
-  if (forced_stop != 2) {
-    hstep >>= 1;
-    while (quarteriters--) {
-      FIRST_LEVEL_CHECKS;
-      // no reason to check the same one again.
-      if (tr == br && tc == bc)
-        break;
-      tr = br;
-      tc = bc;
-    }
-  }
-
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
-    hstep >>= 1;
-    while (eighthiters--) {
-      FIRST_LEVEL_CHECKS;
-      // no reason to check the same one again.
-      if (tr == br && tc == bc)
-        break;
-      tr = br;
-      tc = bc;
-    }
-  }
-  bestmv->row = br;
-  bestmv->col = bc;
-
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
-
-  return besterr;
-}
-
-int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
+int vp9_find_best_sub_pixel_comp_tree(const MACROBLOCK *x,
                                       MV *bestmv, const MV *ref_mv,
                                       int allow_hp,
                                       int error_per_bit,
@@ -578,21 +366,21 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
                                       unsigned int *sse1,
                                       const uint8_t *second_pred,
                                       int w, int h) {
-  uint8_t *z = x->plane[0].src.buf;
+  const uint8_t *const z = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
-  MACROBLOCKD *xd = &x->e_mbd;
+  const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
   unsigned int whichdir;
   int thismse;
-  unsigned int halfiters = iters_per_step;
-  unsigned int quarteriters = iters_per_step;
-  unsigned int eighthiters = iters_per_step;
+  const unsigned int halfiters = iters_per_step;
+  const unsigned int quarteriters = iters_per_step;
+  const unsigned int eighthiters = iters_per_step;
 
   DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
   const int y_stride = xd->plane[0].pre[0].stride;
   const int offset = bestmv->row * y_stride + bestmv->col;
-  uint8_t *y = xd->plane[0].pre[0].buf + offset;
+  const uint8_t *const y = xd->plane[0].pre[0].buf;
 
   int rr = ref_mv->row;
   int rc = ref_mv->col;
@@ -614,7 +402,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
   // calculate central point error
   // TODO(yunqingwang): central pointer error was already calculated in full-
   // pixel search, and can be passed in this function.
-  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
   besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
@@ -652,6 +440,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
     tr = br;
     tc = bc;
   }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
   bestmv->row = br;
   bestmv->col = bc;
 
@@ -665,48 +458,33 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
 #undef MVC
 #undef PRE
 #undef DIST
-#undef IFMVCV
 #undef CHECK_BETTER
-#undef SP
 
-#define CHECK_BOUNDS(range) \
-  {\
-    all_in = 1;\
-    all_in &= ((br-range) >= x->mv_row_min);\
-    all_in &= ((br+range) <= x->mv_row_max);\
-    all_in &= ((bc-range) >= x->mv_col_min);\
-    all_in &= ((bc+range) <= x->mv_col_max);\
-  }
+static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
+                               int range) {
+  return ((row - range) >= x->mv_row_min) &
+         ((row + range) <= x->mv_row_max) &
+         ((col - range) >= x->mv_col_min) &
+         ((col + range) <= x->mv_col_max);
+}
 
-#define CHECK_POINT \
-  {\
-    if (this_mv.col < x->mv_col_min) continue;\
-    if (this_mv.col > x->mv_col_max) continue;\
-    if (this_mv.row < x->mv_row_min) continue;\
-    if (this_mv.row > x->mv_row_max) continue;\
-  }
+static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) {
+  return (mv->col >= x->mv_col_min) && (mv->col <= x->mv_col_max) &&
+         (mv->row >= x->mv_row_min) && (mv->row <= x->mv_row_max);
+}
 
 #define CHECK_BETTER \
   {\
-    if (thissad < bestsad)\
-    {\
+    if (thissad < bestsad) {\
       if (use_mvcost) \
-        thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \
-                                  mvjsadcost, mvsadcost, \
-                                  sad_per_bit);\
-      if (thissad < bestsad)\
-      {\
+        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);\
+      if (thissad < bestsad) {\
         bestsad = thissad;\
         best_site = i;\
       }\
     }\
   }
 
-#define get_next_chkpts(list, i, n)   \
-    list[0] = ((i) == 0 ? (n) - 1 : (i) - 1);  \
-    list[1] = (i);                             \
-    list[2] = ((i) == (n) - 1 ? 0 : (i) + 1);
-
 #define MAX_PATTERN_SCALES         11
 #define MAX_PATTERN_CANDIDATES      8  // max number of canddiates per scale
 #define PATTERN_CANDIDATES_REF      3  // number of refinement candidates
@@ -715,56 +493,40 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
 // Each scale can have a different number of candidates and shape of
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
-static int vp9_pattern_search(MACROBLOCK *x,
+static int vp9_pattern_search(const MACROBLOCK *x,
                               MV *ref_mv,
                               int search_param,
                               int sad_per_bit,
-                              int do_init_search,
-                              int do_refine,
+                              int do_init_search, int do_refine,
                               const vp9_variance_fn_ptr_t *vfp,
                               int use_mvcost,
                               const MV *center_mv, MV *best_mv,
                               const int num_candidates[MAX_PATTERN_SCALES],
                               const MV candidates[MAX_PATTERN_SCALES]
                                                  [MAX_PATTERN_CANDIDATES]) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
+  const MACROBLOCKD *const xd = &x->e_mbd;
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   };
   int i, j, s, t;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   int br, bc;
-  MV this_mv;
   int bestsad = INT_MAX;
   int thissad;
-  uint8_t *base_offset;
-  uint8_t *this_offset;
   int k = -1;
-  int all_in;
-  int best_site = -1;
-  int_mv fcenter_mv;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_init_s = search_param_to_steps[search_param];
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->row >> 3;
-  fcenter_mv.as_mv.col = center_mv->col >> 3;
-
   // adjust ref_mv to make sure it is within MV range
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
   br = ref_mv->row;
   bc = ref_mv->col;
 
   // Work out the start point for the search
-  base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
-  this_offset = base_offset + (br * in_what_stride) + bc;
-  this_mv.row = br;
-  this_mv.col = bc;
-  bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
-                                 mvjsadcost, mvsadcost, sad_per_bit);
+  bestsad = vfp->sdf(what->buf, what->stride,
+                     get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                     0x7fffffff) + mvsad_err_cost(x, ref_mv, &fcenter_mv,
+                                                  sad_per_bit);
 
   // Search all possible scales upto the search param around the center point
   // pick the scale of the point that is best as the starting scale of
@@ -773,27 +535,25 @@ static int vp9_pattern_search(MACROBLOCK *x,
     s = best_init_s;
     best_init_s = -1;
     for (t = 0; t <= s; ++t) {
-      best_site = -1;
-      CHECK_BOUNDS((1 << t))
-      if (all_in) {
+      int best_site = -1;
+      if (check_bounds(x, br, bc, 1 << t)) {
         for (i = 0; i < num_candidates[t]; i++) {
-          this_mv.row = br + candidates[t][i].row;
-          this_mv.col = bc + candidates[t][i].col;
-          this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                       this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < num_candidates[t]; i++) {
-          this_mv.row = br + candidates[t][i].row;
-          this_mv.col = bc + candidates[t][i].col;
-          CHECK_POINT
-          this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                       this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          const MV this_mv = {br + candidates[t][i].row,
+                              bc + candidates[t][i].col};
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       }
@@ -813,31 +573,30 @@ static int vp9_pattern_search(MACROBLOCK *x,
   // If the center point is still the best, just skip this and move to
   // the refinement step.
   if (best_init_s != -1) {
+    int best_site = -1;
     s = best_init_s;
-    best_site = -1;
+
     do {
       // No need to search all 6 points the 1st time if initial search was used
       if (!do_init_search || s != best_init_s) {
-        CHECK_BOUNDS((1 << s))
-        if (all_in) {
+        if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
-            this_mv.row = br + candidates[s][i].row;
-            this_mv.col = bc + candidates[s][i].col;
-            this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < num_candidates[s]; i++) {
-            this_mv.row = br + candidates[s][i].row;
-            this_mv.col = bc + candidates[s][i].col;
-            CHECK_POINT
-            this_offset = base_offset + (this_mv.row * in_what_stride) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            const MV this_mv = {br + candidates[s][i].row,
+                                bc + candidates[s][i].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         }
@@ -854,28 +613,28 @@ static int vp9_pattern_search(MACROBLOCK *x,
       do {
         int next_chkpts_indices[PATTERN_CANDIDATES_REF];
         best_site = -1;
-        CHECK_BOUNDS((1 << s))
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
 
-        get_next_chkpts(next_chkpts_indices, k, num_candidates[s]);
-        if (all_in) {
+        if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
-            this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
-            this_offset = base_offset + (this_mv.row * (in_what_stride)) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         } else {
           for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            this_mv.row = br + candidates[s][next_chkpts_indices[i]].row;
-            this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col;
-            CHECK_POINT
-            this_offset = base_offset + (this_mv.row * (in_what_stride)) +
-                                         this_mv.col;
-            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                               bestsad);
+            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+                                bc + candidates[s][next_chkpts_indices[i]].col};
+            if (!is_mv_in(x, &this_mv))
+              continue;
+            thissad = vfp->sdf(what->buf, what->stride,
+                               get_buf_from_mv(in_what, &this_mv),
+                               in_what->stride, bestsad);
             CHECK_BETTER
           }
         }
@@ -892,34 +651,31 @@ static int vp9_pattern_search(MACROBLOCK *x,
   // Check 4 1-away neighbors if do_refine is true.
   // For most well-designed schemes do_refine will not be necessary.
   if (do_refine) {
-    static const MV neighbors[4] = {
-      {0, -1}, { -1, 0}, {1, 0}, {0, 1},
-    };
+    static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+
     for (j = 0; j < 16; j++) {
-      best_site = -1;
-      CHECK_BOUNDS(1)
-      if (all_in) {
+      int best_site = -1;
+      if (check_bounds(x, br, bc, 1)) {
         for (i = 0; i < 4; i++) {
-          this_mv.row = br + neighbors[i].row;
-          this_mv.col = bc + neighbors[i].col;
-          this_offset = base_offset + (this_mv.row * (in_what_stride)) +
-                                       this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          const MV this_mv = {br + neighbors[i].row,
+                              bc + neighbors[i].col};
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
       } else {
         for (i = 0; i < 4; i++) {
-          this_mv.row = br + neighbors[i].row;
-          this_mv.col = bc + neighbors[i].col;
-          CHECK_POINT
-          this_offset = base_offset + (this_mv.row * (in_what_stride)) +
-                                       this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
+          const MV this_mv = {br + neighbors[i].row,
+                              bc + neighbors[i].col};
+          if (!is_mv_in(x, &this_mv))
+            continue;
+          thissad = vfp->sdf(what->buf, what->stride,
+                             get_buf_from_mv(in_what, &this_mv),
+                             in_what->stride, bestsad);
           CHECK_BETTER
         }
-          }
+      }
 
       if (best_site == -1) {
         break;
@@ -933,22 +689,43 @@ static int vp9_pattern_search(MACROBLOCK *x,
   best_mv->row = br;
   best_mv->col = bc;
 
-  this_offset = base_offset + (best_mv->row * in_what_stride) +
-                               best_mv->col;
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-  if (bestsad == INT_MAX)
-    return INT_MAX;
+  return bestsad;
+}
 
-  return vfp->vf(what, what_stride, this_offset, in_what_stride,
-                 (unsigned int *)&bestsad) +
-         use_mvcost ? mv_err_cost(&this_mv, center_mv,
-                                  x->nmvjointcost, x->mvcost, x->errorperbit)
-                    : 0;
+int vp9_get_mvpred_var(const MACROBLOCK *x,
+                       const MV *best_mv, const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost) {  // nonzero: add MV cost
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};  // x8 for mv_err_cost()
+  unsigned int unused;  // out-param of vf(); value is never read
+  // vf() of src against pre[0] at best_mv, plus mv_err_cost() if use_mvcost.
+  return vfp->vf(what->buf, what->stride,
+                 get_buf_from_mv(in_what, best_mv), in_what->stride, &unused) +
+      (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                 x->mvcost, x->errorperbit) : 0);
 }
 
+int vp9_get_mvpred_av_var(const MACROBLOCK *x,
+                          const MV *best_mv, const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost) {  // nonzero: add MV cost
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};  // x8 for mv_err_cost()
+  unsigned int unused;  // out-param of svaf(); value is never read
+  // svaf() with second_pred at best_mv; 0, 0 presumably sub-pel offsets (TODO).
+  return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+                   what->buf, what->stride, &unused, second_pred) +
+      (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                 x->mvcost, x->errorperbit) : 0);
+}
 
-int vp9_hex_search(MACROBLOCK *x,
+int vp9_hex_search(const MACROBLOCK *x,
                    MV *ref_mv,
                    int search_param,
                    int sad_per_bit,
@@ -976,14 +753,13 @@ int vp9_hex_search(MACROBLOCK *x,
     {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024},
       { -1024, 0}},
   };
-  return
-      vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                         do_init_search, 0, vfp, use_mvcost,
-                         center_mv, best_mv,
-                         hex_num_candidates, hex_candidates);
+  return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
+                            do_init_search, 0, vfp, use_mvcost,
+                            center_mv, best_mv,
+                            hex_num_candidates, hex_candidates);
 }
 
-int vp9_bigdia_search(MACROBLOCK *x,
+int vp9_bigdia_search(const MACROBLOCK *x,
                       MV *ref_mv,
                       int search_param,
                       int sad_per_bit,
@@ -1024,7 +800,7 @@ int vp9_bigdia_search(MACROBLOCK *x,
                             bigdia_num_candidates, bigdia_candidates);
 }
 
-int vp9_square_search(MACROBLOCK *x,
+int vp9_square_search(const MACROBLOCK *x,
                       MV *ref_mv,
                       int search_param,
                       int sad_per_bit,
@@ -1063,96 +839,159 @@ int vp9_square_search(MACROBLOCK *x,
                             do_init_search, 0, vfp, use_mvcost,
                             center_mv, best_mv,
                             square_num_candidates, square_candidates);
-};
-
-#undef CHECK_BOUNDS
-#undef CHECK_POINT
-#undef CHECK_BETTER
-
-int vp9_diamond_search_sad_c(MACROBLOCK *x,
-                             int_mv *ref_mv, int_mv *best_mv,
-                             int search_param, int sad_per_bit, int *num00,
-                             vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
-                             int *mvcost[2], int_mv *center_mv) {
-  int i, j, step;
-
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *best_address;
-
-  int tot_steps;
-  int_mv this_mv;
+}
 
-  int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
+int vp9_fast_hex_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,  // floor: MAX_MVSEARCH_STEPS - 2
+                        int sad_per_bit,
+                        int do_init_search,  // must be zero for fast_hex
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv) {
+  return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                        sad_per_bit, do_init_search, vfp, use_mvcost,
+                        center_mv, best_mv);  // other args pass through
+}
 
-  int ref_row, ref_col;
-  int this_row_offset, this_col_offset;
-  search_site *ss;
+int vp9_fast_dia_search(const MACROBLOCK *x,
+                        MV *ref_mv,
+                        int search_param,  // floor: MAX_MVSEARCH_STEPS - 2
+                        int sad_per_bit,
+                        int do_init_search,
+                        const vp9_variance_fn_ptr_t *vfp,
+                        int use_mvcost,
+                        const MV *center_mv,
+                        MV *best_mv) {
+  return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                           sad_per_bit, do_init_search, vfp, use_mvcost,
+                           center_mv, best_mv);  // other args pass through
+}
 
-  uint8_t *check_here;
-  int thissad;
-  int_mv fcenter_mv;
+#undef CHECK_BETTER
 
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+int vp9_full_range_search_c(const MACROBLOCK *x,
+                            const search_site_config *cfg,
+                            MV *ref_mv, MV *best_mv,
+                            int search_param, int sad_per_bit, int *num00,
+                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int range = 64;  // max row/col displacement from ref_mv
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = INT_MAX;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+  // Exhaustive SAD search of +/-range around ref_mv, clipped to the MV limits.
+  // The cfg and search_param parameters are not used in this search variant
+  (void)cfg;
+  (void)search_param;
 
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  *best_mv = *ref_mv;
+  *num00 = 11;  // NOTE(review): meaning of 11 is not visible here
+  best_sad = fn_ptr->sdf(what->buf, what->stride,
+                         get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                         0x7fffffff) +
+                 mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  start_row = MAX(-range, x->mv_row_min - ref_mv->row);
+  start_col = MAX(-range, x->mv_col_min - ref_mv->col);
+  end_row = MIN(range, x->mv_row_max - ref_mv->row);
+  end_col = MIN(range, x->mv_col_max - ref_mv->col);
+  // Each row: groups of 4 columns via sdx4df, leftover columns one at a time.
+  for (r = start_row; r <= end_row; ++r) {
+    for (c = start_col; c <= end_col; c += 4) {
+      if (c + 3 <= end_col) {
+        unsigned int sads[4];
+        const uint8_t *addrs[4];
+        for (i = 0; i < 4; ++i) {
+          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
+          addrs[i] = get_buf_from_mv(in_what, &mv);
+        }
 
-  clamp_mv(&ref_mv->as_mv,
-           x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->as_mv.row;
-  ref_col = ref_mv->as_mv.col;
-  *num00 = 0;
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
+        fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
 
-  // Work out the start point for the search
-  in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
-                        (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
-  best_address = in_what;
+        for (i = 0; i < 4; ++i) {
+          if (sads[i] < best_sad) {  // add MV cost only if raw SAD can win
+            const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
+            const unsigned int sad = sads[i] +
+                mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
+          }
+        }
+      } else {  // fewer than 4 columns remain in this row
+        for (i = 0; i <= end_col - c; ++i) {  // inclusive: end_col is searched
+          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
+          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+              get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
+          }
+        }
+      }
+    }
+  }
 
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
-                                 mvjsadcost, mvsadcost, sad_per_bit);
+  return best_sad;  // lowest SAD + MV cost found; *best_mv is its position
+}
 
+int vp9_diamond_search_sad_c(const MACROBLOCK *x,
+                             const search_site_config *cfg,
+                             MV *ref_mv, MV *best_mv,
+                             int search_param, int sad_per_bit, int *num00,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   // search_param determines the length of the initial step and hence the number
   // of iterations
   // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
   // (MAX_FIRST_STEP/4) pel... etc.
-  ss = &x->ss[search_param * x->searches_per_step];
-  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_address, *in_what_ref;
+  int best_sad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+  int i, j, step;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  in_what_ref = get_buf_from_mv(in_what, ref_mv);
+  best_address = in_what_ref;
+  *num00 = 0;
+  *best_mv = *ref_mv;
+
+  // Check the starting position
+  best_sad = fn_ptr->sdf(what->buf, what->stride,
+                         best_address, in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
 
   for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < x->searches_per_step; j++) {
-      // Trap illegal vectors
-      this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
-      this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
-      if ((this_col_offset > x->mv_col_min) &&
-          (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) &&
-          (this_row_offset < x->mv_row_max)) {
-        check_here = ss[i].offset + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                              bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                    mvjsadcost, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
+    for (j = 0; j < cfg->searches_per_step; j++) {
+      const MV mv = {best_mv->row + ss[i].mv.row,
+                     best_mv->col + ss[i].mv.col};
+      if (is_mv_in(x, &mv)) {
+       int sad = fn_ptr->sdf(what->buf, what->stride,
+                             best_address + ss[i].offset, in_what->stride,
+                             best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = i;
           }
         }
@@ -1162,30 +1001,24 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
     }
 
     if (best_site != last_site) {
-      best_mv->as_mv.row += ss[best_site].mv.row;
-      best_mv->as_mv.col += ss[best_site].mv.col;
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
       best_address += ss[best_site].offset;
       last_site = best_site;
 #if defined(NEW_DIAMOND_SEARCH)
       while (1) {
-        this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
-        this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
-        if ((this_col_offset > x->mv_col_min) &&
-            (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) &&
-            (this_row_offset < x->mv_row_max)) {
-          check_here = ss[best_site].offset + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                bestsad);
-          if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                      mvjsadcost, mvsadcost, sad_per_bit);
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_mv->as_mv.row += ss[best_site].mv.row;
-              best_mv->as_mv.col += ss[best_site].mv.col;
+        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
+                            best_mv->col + ss[best_site].mv.col};
+        if (is_mv_in(x, &this_mv)) {
+          int sad = fn_ptr->sdf(what->buf, what->stride,
+                                best_address + ss[best_site].offset,
+                                in_what->stride, best_sad);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              best_mv->row += ss[best_site].mv.row;
+              best_mv->col += ss[best_site].mv.col;
               best_address += ss[best_site].offset;
               continue;
             }
@@ -1194,39 +1027,27 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
         break;
       };
 #endif
-    } else if (best_address == in_what) {
+    } else if (best_address == in_what_ref) {
       (*num00)++;
     }
   }
-
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                    (unsigned int *)(&thissad)) +
-                       mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                   mvjcost, mvcost, x->errorperbit);
+  return best_sad;
 }
 
-int vp9_diamond_search_sadx4(MACROBLOCK *x,
-                             int_mv *ref_mv, int_mv *best_mv, int search_param,
+int vp9_diamond_search_sadx4(const MACROBLOCK *x,
+                             const search_site_config *cfg,
+                             MV *ref_mv, MV *best_mv, int search_param,
                              int sad_per_bit, int *num00,
-                             vp9_variance_fn_ptr_t *fn_ptr,
-                             int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv) {
   int i, j, step;
 
-  const MACROBLOCKD* const xd = &x->e_mbd;
+  const MACROBLOCKD *const xd = &x->e_mbd;
   uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *best_address;
-
-  int tot_steps;
-  int_mv this_mv;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address;
 
   unsigned int bestsad = INT_MAX;
   int best_site = 0;
@@ -1234,45 +1055,30 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
 
   int ref_row;
   int ref_col;
-  int this_row_offset;
-  int this_col_offset;
-  search_site *ss;
-
-  uint8_t *check_here;
-  unsigned int thissad;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
 
-  clamp_mv(&ref_mv->as_mv,
-           x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->as_mv.row;
-  ref_col = ref_mv->as_mv.col;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->row;
+  ref_col = ref_mv->col;
   *num00 = 0;
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
+  best_mv->row = ref_row;
+  best_mv->col = ref_col;
 
   // Work out the start point for the search
-  in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
-                        (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
+  in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
   best_address = in_what;
 
   // Check the starting position
   bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
-                + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
-                                 mvjsadcost, mvsadcost, sad_per_bit);
-
-  // search_param determines the length of the initial step and hence the number
-  // of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
-  ss = &x->ss[search_param * x->searches_per_step];
-  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+                + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
   i = 1;
 
@@ -1281,10 +1087,10 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
 
     // All_in is true if every one of the points we are checking are within
     // the bounds of the image.
-    all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
-    all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
-    all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
-    all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+    all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_row_min);
+    all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_row_max);
+    all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_col_min);
+    all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_col_max);
 
     // If all the pixels are within the bounds we don't check whether the
     // search point is valid in this loop,  otherwise we check each point
@@ -1292,7 +1098,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
     if (all_in) {
       unsigned int sad_array[4];
 
-      for (j = 0; j < x->searches_per_step; j += 4) {
+      for (j = 0; j < cfg->searches_per_step; j += 4) {
         unsigned char const *block_offset[4];
 
         for (t = 0; t < 4; t++)
@@ -1303,11 +1109,10 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
 
         for (t = 0; t < 4; t++, i++) {
           if (sad_array[t] < bestsad) {
-            this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
-            this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
-            sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                           mvjsadcost, mvsadcost, sad_per_bit);
-
+            const MV this_mv = {best_mv->row + ss[i].mv.row,
+                                best_mv->col + ss[i].mv.col};
+            sad_array[t] += mvsad_err_cost(x, &this_mv, &fcenter_mv,
+                                           sad_per_bit);
             if (sad_array[t] < bestsad) {
               bestsad = sad_array[t];
               best_site = i;
@@ -1316,25 +1121,18 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
         }
       }
     } else {
-      for (j = 0; j < x->searches_per_step; j++) {
+      for (j = 0; j < cfg->searches_per_step; j++) {
         // Trap illegal vectors
-        this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
-        this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+        const MV this_mv = {best_mv->row + ss[i].mv.row,
+                            best_mv->col + ss[i].mv.col};
 
-        if ((this_col_offset > x->mv_col_min) &&
-            (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) &&
-            (this_row_offset < x->mv_row_max)) {
-          check_here = ss[i].offset + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                bestsad);
+        if (is_mv_in(x, &this_mv)) {
+          const uint8_t *const check_here = ss[i].offset + best_address;
+          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                             in_what_stride, bestsad);
 
           if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                      mvjsadcost, mvsadcost, sad_per_bit);
-
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
             if (thissad < bestsad) {
               bestsad = thissad;
               best_site = i;
@@ -1345,30 +1143,25 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
       }
     }
     if (best_site != last_site) {
-      best_mv->as_mv.row += ss[best_site].mv.row;
-      best_mv->as_mv.col += ss[best_site].mv.col;
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
       best_address += ss[best_site].offset;
       last_site = best_site;
 #if defined(NEW_DIAMOND_SEARCH)
       while (1) {
-        this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
-        this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
-        if ((this_col_offset > x->mv_col_min) &&
-            (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) &&
-            (this_row_offset < x->mv_row_max)) {
-          check_here = ss[best_site].offset + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                bestsad);
+        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
+                            best_mv->col + ss[best_site].mv.col};
+        if (is_mv_in(x, &this_mv)) {
+          const uint8_t *const check_here = ss[best_site].offset + best_address;
+          unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                             in_what_stride, bestsad);
           if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                      mvjsadcost, mvsadcost, sad_per_bit);
+            thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv,
+                                      sad_per_bit);
             if (thissad < bestsad) {
               bestsad = thissad;
-              best_mv->as_mv.row += ss[best_site].mv.row;
-              best_mv->as_mv.col += ss[best_site].mv.col;
+              best_mv->row += ss[best_site].mv.row;
+              best_mv->col += ss[best_site].mv.col;
               best_address += ss[best_site].offset;
               continue;
             }
@@ -1381,498 +1174,287 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
       (*num00)++;
     }
   }
-
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                    (unsigned int *)(&thissad)) +
-                    mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                mvjcost, mvcost, x->errorperbit);
+  return bestsad;
 }
 
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
 
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
-                           int_mv *mvp_full, int step_param,
-                           int sadpb, int further_steps,
-                           int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
-                           int_mv *ref_mv, int_mv *dst_mv) {
-  int_mv temp_mv;
-  int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
-                                        step_param, sadpb, &num00,
-                                        fn_ptr, x->nmvjointcost,
-                                        x->mvcost, ref_mv);
-  dst_mv->as_int = temp_mv.as_int;
-
-  n = num00;
-  num00 = 0;
-
-  /* If there won't be more n-step search, check to see if refining search is
-   * needed. */
+int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
+                           MV *mvp_full, int step_param,
+                           int sadpb, int further_steps, int do_refine,
+                           const vp9_variance_fn_ptr_t *fn_ptr,
+                           const MV *ref_mv, MV *dst_mv) {
+  MV temp_mv;
+  int thissme, n, num00 = 0;
+  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+                                        step_param, sadpb, &n,
+                                        fn_ptr, ref_mv);
+  if (bestsme < INT_MAX)
+    bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // If there will be no more n-step searches, check whether the refining
+  // search is needed.
   if (n > further_steps)
     do_refine = 0;
 
   while (n < further_steps) {
-    n++;
+    ++n;
 
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
-                                        fn_ptr, x->nmvjointcost, x->mvcost,
-                                        ref_mv);
+                                        fn_ptr, ref_mv);
+      if (thissme < INT_MAX)
+        thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
 
-      /* check to see if refining search is needed. */
-      if (num00 > (further_steps - n))
+      // check to see if refining search is needed.
+      if (num00 > further_steps - n)
         do_refine = 0;
 
       if (thissme < bestsme) {
         bestsme = thissme;
-        dst_mv->as_int = temp_mv.as_int;
+        *dst_mv = temp_mv;
       }
     }
   }
 
-  /* final 1-away diamond refining search */
-  if (do_refine == 1) {
-    int search_range = 8;
-    int_mv best_mv;
-    best_mv.as_int = dst_mv->as_int;
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV best_mv = *dst_mv;
     thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
-                                       fn_ptr, x->nmvjointcost, x->mvcost,
-                                       ref_mv);
-
+                                       fn_ptr, ref_mv);
+    if (thissme < INT_MAX)
+      thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
     if (thissme < bestsme) {
       bestsme = thissme;
-      dst_mv->as_int = best_mv.as_int;
+      *dst_mv = best_mv;
     }
   }
   return bestsme;
 }
 
-int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
+int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
-                          int *mvcost[2],
-                          int_mv *center_mv, int n) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  int mv_stride = xd->plane[0].pre[0].stride;
-  uint8_t *bestaddress;
-  int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
-  int_mv this_mv;
-  int bestsad = INT_MAX;
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
   int r, c;
-
-  uint8_t *check_here;
-  int thissad;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = xd->plane[0].pre[0].buf;
-  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
-                        in_what_stride, 0x7fffffff)
-                           + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
-                                            mvjsadcost, mvsadcost, sad_per_bit);
-
-  // Apply further limits to prevent us looking using vectors that stretch
-  // beyond the UMV border
-  col_min = MAX(col_min, x->mv_col_min);
-  col_max = MIN(col_max, x->mv_col_max);
-  row_min = MAX(row_min, x->mv_row_min);
-  row_max = MIN(row_max, x->mv_row_max);
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-
-    for (c = col_min; c < col_max; c++) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                            bestsad);
-
-      this_mv.as_mv.col = c;
-      thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                mvjsadcost, mvsadcost, sad_per_bit);
-
-      if (thissad < bestsad) {
-        bestsad = thissad;
-        best_mv->as_mv.row = r;
-        best_mv->as_mv.col = c;
-        bestaddress = check_here;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
+
+  for (r = row_min; r < row_max; ++r) {
+    for (c = col_min; c < col_max; ++c) {
+      const MV mv = {r, c};
+      const int sad = fn_ptr->sdf(what->buf, what->stride,
+          get_buf_from_mv(in_what, &mv), in_what->stride, best_sad) +
+              mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+      if (sad < best_sad) {
+        best_sad = sad;
+        *best_mv = mv;
       }
-
-      check_here++;
     }
   }
-
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return best_sad;
 }
 
-int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
+int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
-                          int *mvcost[2], int_mv *center_mv, int n) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  int mv_stride = xd->plane[0].pre[0].stride;
-  uint8_t *bestaddress;
-  int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  int r, c;
-
-  uint8_t *check_here;
-  unsigned int thissad;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  unsigned int sad_array[3];
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = xd->plane[0].pre[0].buf;
-  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
-                             mvjsadcost, mvsadcost, sad_per_bit);
-
-  // Apply further limits to prevent us looking using vectors that stretch
-  // beyond the UMV border
-  col_min = MAX(col_min, x->mv_col_min);
-  col_max = MIN(col_max, x->mv_col_max);
-  row_min = MAX(row_min, x->mv_row_min);
-  row_max = MIN(row_max, x->mv_row_max);
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                    mvjsadcost, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  int r;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
+
+  for (r = row_min; r < row_max; ++r) {
+    int c = col_min;
+    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+
+    if (fn_ptr->sdx3f != NULL) {
+      while ((c + 2) < col_max) {
+        int i;
+        unsigned int sads[3];
+
+        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
+
+        for (i = 0; i < 3; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
     while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                            bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad  += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                   mvjsadcost, mvsadcost, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
+      unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                     check_here, in_what->stride, best_sad);
+      if (sad < best_sad) {
+        const MV mv = {r, c};
+        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
         }
       }
-
-      check_here++;
-      c++;
+      ++check_here;
+      ++c;
     }
   }
 
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return best_sad;
 }
 
-int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
+int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr,
-                          int *mvjcost, int *mvcost[2],
-                          int_mv *center_mv, int n) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  uint8_t *what = x->plane[0].src.buf;
-  int what_stride = x->plane[0].src.stride;
-  uint8_t *in_what;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  int mv_stride = xd->plane[0].pre[0].stride;
-  uint8_t *bestaddress;
-  int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  int r, c;
-
-  uint8_t *check_here;
-  unsigned int thissad;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
-  unsigned int sad_array[3];
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = xd->plane[0].pre[0].buf;
-  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
-                             mvjsadcost, mvsadcost, sad_per_bit);
-
-  // Apply further limits to prevent us looking using vectors that stretch
-  // beyond the UMV border
-  col_min = MAX(col_min, x->mv_col_min);
-  col_max = MIN(col_max, x->mv_col_max);
-  row_min = MAX(row_min, x->mv_row_min);
-  row_max = MIN(row_max, x->mv_row_max);
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; i++) {
-        thissad = (unsigned int)sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                    mvjsadcost, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
+                          const vp9_variance_fn_ptr_t *fn_ptr,
+                          const MV *center_mv, MV *best_mv) {
+  int r;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
+
+  for (r = row_min; r < row_max; ++r) {
+    int c = col_min;
+    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+
+    if (fn_ptr->sdx8f != NULL) {
+      while ((c + 7) < col_max) {
+        int i;
+        unsigned int sads[8];
+
+        fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
+
+        for (i = 0; i < 8; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
-    while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                     mvjsadcost, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
+    if (fn_ptr->sdx3f != NULL) {
+      while ((c + 2) < col_max) {
+        int i;
+        unsigned int sads[3];
+
+        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
+
+        for (i = 0; i < 3; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
     while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                            bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                  mvjsadcost, mvsadcost, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
+      unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                     check_here, in_what->stride, best_sad);
+      if (sad < best_sad) {
+        const MV mv = {r, c};
+        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
         }
       }
-
-      check_here++;
-      c++;
+      ++check_here;
+      ++c;
     }
   }
 
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return best_sad;
 }
-int vp9_refining_search_sad_c(MACROBLOCK *x,
-                              int_mv *ref_mv, int error_per_bit,
-                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
-                              int *mvjcost, int *mvcost[2], int_mv *center_mv) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-  int this_row_offset, this_col_offset;
-
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *what = x->plane[0].src.buf;
-  uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
-                          ref_mv->as_mv.col;
-  uint8_t *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                        in_what_stride, 0x7fffffff) +
-                        mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
-                                       mvjsadcost, mvsadcost, error_per_bit);
+int vp9_refining_search_sad_c(const MACROBLOCK *x,
+                              MV *ref_mv, int error_per_bit,
+                              int search_range,
+                              const vp9_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv) {
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+                                     get_buf_from_mv(in_what, ref_mv),
+                                     in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int i, j;
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
 
     for (j = 0; j < 4; j++) {
-      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-      if ((this_col_offset > x->mv_col_min) &&
-          (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) &&
-          (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
-                     best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                              bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                    mvjsadcost, mvsadcost, error_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride, best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = j;
           }
         }
@@ -1882,110 +1464,71 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
     if (best_site == -1) {
       break;
     } else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-                      neighbors[best_site].col;
+      ref_mv->row += neighbors[best_site].row;
+      ref_mv->col += neighbors[best_site].col;
     }
   }
-
-  this_mv.as_mv.row = ref_mv->as_mv.row * 8;
-  this_mv.as_mv.col = ref_mv->as_mv.col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return best_sad;
 }
 
-int vp9_refining_search_sadx4(MACROBLOCK *x,
-                              int_mv *ref_mv, int error_per_bit,
-                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
-                              int *mvjcost, int *mvcost[2], int_mv *center_mv) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+int vp9_refining_search_sadx4(const MACROBLOCK *x,
+                              MV *ref_mv, int error_per_bit,
+                              int search_range,
+                              const vp9_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address,
+                                    in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
-  int this_row_offset, this_col_offset;
-
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *what = x->plane[0].src.buf;
-  uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
-                          ref_mv->as_mv.col;
-  uint8_t *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                        in_what_stride, 0x7fffffff) +
-      mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
-                     mvjsadcost, mvsadcost, error_per_bit);
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
-    int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) &
-                 ((ref_mv->as_mv.row + 1) < x->mv_row_max) &
-                 ((ref_mv->as_mv.col - 1) > x->mv_col_min) &
-                 ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+    const int all_in = ((ref_mv->row - 1) > x->mv_row_min) &
+                       ((ref_mv->row + 1) < x->mv_row_max) &
+                       ((ref_mv->col - 1) > x->mv_col_min) &
+                       ((ref_mv->col + 1) < x->mv_col_max);
 
     if (all_in) {
-      unsigned int sad_array[4];
-      unsigned char const *block_offset[4];
-      block_offset[0] = best_address - in_what_stride;
-      block_offset[1] = best_address - 1;
-      block_offset[2] = best_address + 1;
-      block_offset[3] = best_address + in_what_stride;
-
-      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                     sad_array);
-
-      for (j = 0; j < 4; j++) {
-        if (sad_array[j] < bestsad) {
-          this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
-          this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
-          sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                         mvjsadcost, mvsadcost, error_per_bit);
-
-          if (sad_array[j] < bestsad) {
-            bestsad = sad_array[j];
+      unsigned int sads[4];
+      const uint8_t *const positions[4] = {
+        best_address - in_what->stride,
+        best_address - 1,
+        best_address + 1,
+        best_address + in_what->stride
+      };
+
+      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+      for (j = 0; j < 4; ++j) {
+        if (sads[j] < best_sad) {
+          const MV mv = {ref_mv->row + neighbors[j].row,
+                         ref_mv->col + neighbors[j].col};
+          sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sads[j] < best_sad) {
+            best_sad = sads[j];
             best_site = j;
           }
         }
       }
     } else {
-      for (j = 0; j < 4; j++) {
-        this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-        this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-        if ((this_col_offset > x->mv_col_min) &&
-            (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) &&
-            (this_row_offset < x->mv_row_max)) {
-          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
-                       best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                                bestsad);
-
-          if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                      mvjsadcost, mvsadcost, error_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
+      for (j = 0; j < 4; ++j) {
+        const MV mv = {ref_mv->row + neighbors[j].row,
+                       ref_mv->col + neighbors[j].col};
+
+        if (is_mv_in(x, &mv)) {
+          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                         get_buf_from_mv(in_what, &mv),
+                                         in_what->stride, best_sad);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
               best_site = j;
             }
           }
@@ -1996,88 +1539,50 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
     if (best_site == -1) {
       break;
     } else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-                      neighbors[best_site].col;
+      ref_mv->row += neighbors[best_site].row;
+      ref_mv->col += neighbors[best_site].col;
+      best_address = get_buf_from_mv(in_what, ref_mv);
     }
   }
 
-  this_mv.as_mv.row = ref_mv->as_mv.row * 8;
-  this_mv.as_mv.col = ref_mv->as_mv.col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return best_sad;
 }
 
-/* This function is called when we do joint motion search in comp_inter_inter
- * mode.
- */
-int vp9_refining_search_8p_c(MACROBLOCK *x,
-                             int_mv *ref_mv, int error_per_bit,
-                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
-                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
-                             const uint8_t *second_pred, int w, int h) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
-      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+// This function is called when we do joint motion search in comp_inter_inter
+// mode.
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv,
+                             const uint8_t *second_pred) {
+  const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+                           {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride,
+      second_pred, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
-  int this_row_offset, this_col_offset;
 
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *what = x->plane[0].src.buf;
-  uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
-                          ref_mv->as_mv.col;
-  uint8_t *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Get compound pred by averaging two pred blocks. */
-  bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
-                         second_pred, 0x7fffffff) +
-      mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
-                     mvjsadcost, mvsadcost, error_per_bit);
-
-  for (i = 0; i < search_range; i++) {
+  for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
-    for (j = 0; j < 8; j++) {
-      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-      if ((this_col_offset > x->mv_col_min) &&
-          (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) &&
-          (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
-            best_address;
-
-        /* Get compound block and use it to calculate SAD. */
-        thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
-                               second_pred, bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
-                                    mvjsadcost, mvsadcost, error_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
+    for (j = 0; j < 8; ++j) {
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride,
+            second_pred, best_sad);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
             best_site = j;
           }
         }
@@ -2087,24 +1592,9 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
     if (best_site == -1) {
       break;
     } else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-          neighbors[best_site].col;
+      ref_mv->row += neighbors[best_site].row;
+      ref_mv->col += neighbors[best_site].col;
     }
   }
-
-  this_mv.as_mv.row = ref_mv->as_mv.row * 8;
-  this_mv.as_mv.col = ref_mv->as_mv.col * 8;
-
-  if (bestsad < INT_MAX) {
-    // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
-    // so we don't have to use the subpixel with xoff=0,yoff=0 here.
-    return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride,
-                        (unsigned int *)(&thissad), second_pred) +
-                        mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
-                                    mvjcost, mvcost, x->errorperbit);
-  } else {
-    return INT_MAX;
-  }
+  return best_sad;
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
index bcab679c7e6..873edf376de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
@@ -15,64 +15,83 @@
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_variance.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 // The maximum number of steps in a step search given the largest
 // allowed initial step
 #define MAX_MVSEARCH_STEPS 11
-// Max full pel mv specified in 1 pel units
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+// Maximum full-pel motion vector magnitude, in units of full pixels.
+// This allows motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
 // Maximum size of the first step in full pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
 // Allowed motion vector pixel distance outside image border
 // for Block_16x16
 #define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
 
+// motion search site
+typedef struct search_site {
+  MV mv;
+  int offset;
+} search_site;
+
+typedef struct search_site_config {
+  search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
+  int ss_count;
+  int searches_per_step;
+} search_site_config;
+
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void vp9_init3smotion_compensation(search_site_config *cfg,  int stride);
 
-void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv);
+void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv);
 int vp9_mv_bit_cost(const MV *mv, const MV *ref,
                     const int *mvjcost, int *mvcost[2], int weight);
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
+
+// Utility to compute variance + MV rate cost for a given MV
+int vp9_get_mvpred_var(const MACROBLOCK *x,
+                       const MV *best_mv, const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost);
+int vp9_get_mvpred_av_var(const MACROBLOCK *x,
+                          const MV *best_mv, const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost);
 
 struct VP9_COMP;
-int vp9_init_search_range(struct VP9_COMP *cpi, int size);
+struct SPEED_FEATURES;
+
+int vp9_init_search_range(const struct SPEED_FEATURES *sf, int size);
 
 // Runs sequence of diamond searches in smaller steps for RD
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
-                           int_mv *mvp_full, int step_param,
+int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
+                           MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
-                           vp9_variance_fn_ptr_t *fn_ptr,
-                           int_mv *ref_mv, int_mv *dst_mv);
-
-int vp9_hex_search(MACROBLOCK *x,
-                   MV *ref_mv,
-                   int search_param,
-                   int error_per_bit,
-                   int do_init_search,
-                   const vp9_variance_fn_ptr_t *vf,
-                   int use_mvcost,
-                   const MV *center_mv,
-                   MV *best_mv);
-int vp9_bigdia_search(MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int error_per_bit,
-                      int do_init_search,
-                      const vp9_variance_fn_ptr_t *vf,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv);
-int vp9_square_search(MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int error_per_bit,
-                      int do_init_search,
-                      const vp9_variance_fn_ptr_t *vf,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv);
+                           const vp9_variance_fn_ptr_t *fn_ptr,
+                           const MV *ref_mv, MV *dst_mv);
+
+typedef int (integer_mv_pattern_search_fn) (
+    const MACROBLOCK *x,
+    MV *ref_mv,
+    int search_param,
+    int error_per_bit,
+    int do_init_search,
+    const vp9_variance_fn_ptr_t *vf,
+    int use_mvcost,
+    const MV *center_mv,
+    MV *best_mv);
+
+integer_mv_pattern_search_fn vp9_hex_search;
+integer_mv_pattern_search_fn vp9_bigdia_search;
+integer_mv_pattern_search_fn vp9_square_search;
+integer_mv_pattern_search_fn vp9_fast_hex_search;
+integer_mv_pattern_search_fn vp9_fast_dia_search;
 
 typedef int (fractional_mv_step_fp) (
-    MACROBLOCK *x,
+    const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
     int error_per_bit,
@@ -83,11 +102,11 @@ typedef int (fractional_mv_step_fp) (
     int *mvcost[2],
     int *distortion,
     unsigned int *sse);
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative;
+
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 
 typedef int (fractional_mv_step_comp_fp) (
-    MACROBLOCK *x,
+    const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
     int error_per_bit,
@@ -98,34 +117,36 @@ typedef int (fractional_mv_step_comp_fp) (
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
     int w, int h);
-extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative;
+
 extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
 
-typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
-                                    int_mv *ref_mv, int sad_per_bit,
-                                    int distance, vp9_variance_fn_ptr_t *fn_ptr,
-                                    int *mvjcost, int *mvcost[2],
-                                    int_mv *center_mv, int n);
+typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
+                                    const MV *ref_mv, int sad_per_bit,
+                                    int distance,
+                                    const vp9_variance_fn_ptr_t *fn_ptr,
+                                    const MV *center_mv, MV *best_mv);
 
-typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x,
-                                        int_mv *ref_mv, int sad_per_bit,
+typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x,
+                                        MV *ref_mv, int sad_per_bit,
                                         int distance,
-                                        vp9_variance_fn_ptr_t *fn_ptr,
-                                        int *mvjcost, int *mvcost[2],
-                                        int_mv *center_mv);
+                                        const vp9_variance_fn_ptr_t *fn_ptr,
+                                        const MV *center_mv);
 
-typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
-                                       int_mv *ref_mv, int_mv *best_mv,
+typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x,
+                                       const search_site_config *cfg,
+                                       MV *ref_mv, MV *best_mv,
                                        int search_param, int sad_per_bit,
                                        int *num00,
-                                       vp9_variance_fn_ptr_t *fn_ptr,
-                                       int *mvjcost, int *mvcost[2],
-                                       int_mv *center_mv);
-
-int vp9_refining_search_8p_c(MACROBLOCK *x,
-                             int_mv *ref_mv, int error_per_bit,
-                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
-                             int *mvjcost, int *mvcost[2],
-                             int_mv *center_mv, const uint8_t *second_pred,
-                             int w, int h);
+                                       const vp9_variance_fn_ptr_t *fn_ptr,
+                                       const MV *center_mv);
+
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, const uint8_t *second_pred);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_modecosts.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_modecosts.c
deleted file mode 100644
index 7eb65923244..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_modecosts.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *const cm = &c->common;
-  const vp9_tree_index *KT = vp9_intra_mode_tree;
-  int i, j;
-
-  for (i = 0; i < INTRA_MODES; i++) {
-    for (j = 0; j < INTRA_MODES; j++) {
-      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
-                      KT);
-    }
-  }
-
-  // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
-                  vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  vp9_kf_uv_mode_prob[INTRA_MODES - 1],
-                  vp9_intra_mode_tree);
-
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    cm->fc.switchable_interp_prob[i],
-                    vp9_switchable_interp_tree);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_onyx_if.c
deleted file mode 100644
index b664f1e998e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_onyx_if.c
+++ /dev/null
@@ -1,4319 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <limits.h>
-
-#include "./vpx_config.h"
-#include "./vpx_scale_rtcd.h"
-
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_idct.h"
-#if CONFIG_VP9_POSTPROC
-#include "vp9/common/vp9_postproc.h"
-#endif
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_firstpass.h"
-#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/encoder/vp9_psnr.h"
-#include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/vp9_vaq.h"
-
-#include "vpx_ports/vpx_timer.h"
-
-
-extern void print_tree_update_probs();
-
-static void set_default_lf_deltas(struct loopfilter *lf);
-
-#define DEFAULT_INTERP_FILTER SWITCHABLE
-
-#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
-
-#define ALTREF_HIGH_PRECISION_MV 1      // Whether to use high precision mv
-                                         //  for altref computation.
-#define HIGH_PRECISION_MV_QTHRESH 200   // Q threshold for high precision
-                                         // mv. Choose a very high value for
-                                         // now so that HIGH_PRECISION is always
-                                         // chosen.
-
-// Masks for partially or completely disabling split mode
-#define DISABLE_ALL_SPLIT         0x3F
-#define DISABLE_ALL_INTER_SPLIT   0x1F
-#define DISABLE_COMPOUND_SPLIT    0x18
-#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
-
-#if CONFIG_INTERNAL_STATS
-extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
-                            YV12_BUFFER_CONFIG *dest, int lumamask,
-                            double *weight);
-
-
-extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
-                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                             double *ssim_u, double *ssim_v);
-
-
-#endif
-
-// #define OUTPUT_YUV_REC
-
-#ifdef OUTPUT_YUV_SRC
-FILE *yuv_file;
-#endif
-#ifdef OUTPUT_YUV_REC
-FILE *yuv_rec_file;
-#endif
-
-#if 0
-FILE *framepsnr;
-FILE *kf_list;
-FILE *keyfile;
-#endif
-
-
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[INTRA_MODES]
-                           [INTRA_MODES]
-                           [INTRA_MODES];
-#endif
-
-#ifdef MODE_STATS
-extern void init_tx_count_stats();
-extern void write_tx_count_stats();
-extern void init_switchable_interp_stats();
-extern void write_switchable_interp_stats();
-#endif
-
-#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0};
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-extern unsigned __int64 Sectionbits[500];
-#endif
-
-extern void vp9_init_quantizer(VP9_COMP *cpi);
-
-// Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int gf_low_motion_minq[QINDEX_RANGE];
-static int gf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-static int afq_low_motion_minq[QINDEX_RANGE];
-static int afq_high_motion_minq[QINDEX_RANGE];
-
-static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
-  switch (mode) {
-    case NORMAL:
-      *hr = 1;
-      *hs = 1;
-      break;
-    case FOURFIVE:
-      *hr = 4;
-      *hs = 5;
-      break;
-    case THREEFIVE:
-      *hr = 3;
-      *hs = 5;
-    break;
-    case ONETWO:
-      *hr = 1;
-      *hs = 2;
-    break;
-    default:
-      *hr = 1;
-      *hs = 1;
-       assert(0);
-      break;
-  }
-}
-
-// Functions to compute the active minq lookup table entries based on a
-// formulaic approach to facilitate easier adjustment of the Q tables.
-// The formulae were derived from computing a 3rd order polynomial best
-// fit to the original data (after plotting real maxq vs minq (not q index))
-static int calculate_minq_index(double maxq,
-                                double x3, double x2, double x1, double c) {
-  int i;
-  const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
-                                maxq);
-
-  // Special case handling to deal with the step from q2.0
-  // down to lossless mode represented by q 1.0.
-  if (minqtarget <= 2.0)
-    return 0;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (minqtarget <= vp9_convert_qindex_to_q(i))
-      return i;
-  }
-
-  return QINDEX_RANGE - 1;
-}
-
-static void init_minq_luts(void) {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    const double maxq = vp9_convert_qindex_to_q(i);
-
-
-    kf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.000001,
-                                                 -0.0004,
-                                                 0.15,
-                                                 0.0);
-    kf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.000002,
-                                                  -0.0012,
-                                                  0.5,
-                                                  0.0);
-
-    gf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000015,
-                                                 -0.0009,
-                                                 0.32,
-                                                 0.0);
-    gf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000021,
-                                                  -0.00125,
-                                                  0.50,
-                                                  0.0);
-    inter_minq[i] = calculate_minq_index(maxq,
-                                         0.00000271,
-                                         -0.00113,
-                                         0.75,
-                                         0.0);
-    afq_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000015,
-                                                  -0.0009,
-                                                  0.33,
-                                                  0.0);
-    afq_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                   0.0000021,
-                                                   -0.00125,
-                                                   0.55,
-                                                   0.0);
-  }
-}
-
-static int get_active_quality(int q,
-                              int gfu_boost,
-                              int low,
-                              int high,
-                              int *low_motion_minq,
-                              int *high_motion_minq) {
-  int active_best_quality;
-  if (gfu_boost > high) {
-    active_best_quality = low_motion_minq[q];
-  } else if (gfu_boost < low) {
-    active_best_quality = high_motion_minq[q];
-  } else {
-    const int gap = high - low;
-    const int offset = high - gfu_boost;
-    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
-    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-    active_best_quality = low_motion_minq[q] + adjustment;
-  }
-  return active_best_quality;
-}
-
-static void set_mvcost(VP9_COMP *cpi) {
-  MACROBLOCK *const mb = &cpi->mb;
-  if (cpi->common.allow_high_precision_mv) {
-    mb->mvcost = mb->nmvcost_hp;
-    mb->mvsadcost = mb->nmvsadcost_hp;
-  } else {
-    mb->mvcost = mb->nmvcost;
-    mb->mvsadcost = mb->nmvsadcost;
-  }
-}
-
-void vp9_initialize_enc() {
-  static int init_done = 0;
-
-  if (!init_done) {
-    vp9_initialize_common();
-    vp9_tokenize_initialize();
-    vp9_init_quant_tables();
-    vp9_init_me_luts();
-    init_minq_luts();
-    // init_base_skip_probs();
-    init_done = 1;
-  }
-}
-
-static void setup_features(VP9_COMMON *cm) {
-  struct loopfilter *const lf = &cm->lf;
-  struct segmentation *const seg = &cm->seg;
-
-  // Set up default state for MB feature flags
-  seg->enabled = 0;
-
-  seg->update_map = 0;
-  seg->update_data = 0;
-  vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
-
-  vp9_clearall_segfeatures(seg);
-
-  lf->mode_ref_delta_enabled = 0;
-  lf->mode_ref_delta_update = 0;
-  vp9_zero(lf->ref_deltas);
-  vp9_zero(lf->mode_deltas);
-  vp9_zero(lf->last_ref_deltas);
-  vp9_zero(lf->last_mode_deltas);
-
-  set_default_lf_deltas(lf);
-}
-
-static void dealloc_compressor_data(VP9_COMP *cpi) {
-  // Delete sementation map
-  vpx_free(cpi->segmentation_map);
-  cpi->segmentation_map = 0;
-  vpx_free(cpi->common.last_frame_seg_map);
-  cpi->common.last_frame_seg_map = 0;
-  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  cpi->coding_context.last_frame_seg_map_copy = 0;
-
-  vpx_free(cpi->active_map);
-  cpi->active_map = 0;
-
-  vp9_free_frame_buffers(&cpi->common);
-
-  vp9_free_frame_buffer(&cpi->last_frame_uf);
-  vp9_free_frame_buffer(&cpi->scaled_source);
-  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
-  vp9_lookahead_destroy(cpi->lookahead);
-
-  vpx_free(cpi->tok);
-  cpi->tok = 0;
-
-  // Activity mask based per mb zbin adjustments
-  vpx_free(cpi->mb_activity_map);
-  cpi->mb_activity_map = 0;
-  vpx_free(cpi->mb_norm_activity_map);
-  cpi->mb_norm_activity_map = 0;
-
-  vpx_free(cpi->above_context[0]);
-  cpi->above_context[0] = NULL;
-
-  vpx_free(cpi->above_seg_context);
-  cpi->above_seg_context = NULL;
-}
-
-// Computes a q delta (in "q index" terms) to get from a starting q value
-// to a target value
-// target q value
-int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
-  int i;
-  int start_index = cpi->worst_quality;
-  int target_index = cpi->worst_quality;
-
-  // Convert the average q value to an index.
-  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
-    start_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qstart)
-      break;
-  }
-
-  // Convert the q target to an index
-  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
-    target_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qtarget)
-      break;
-  }
-
-  return target_index - start_index;
-}
-
-static void configure_static_seg_features(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  struct segmentation *seg = &cm->seg;
-
-  int high_q = (int)(cpi->avg_q > 48.0);
-  int qi_delta;
-
-  // Disable and clear down for KF
-  if (cm->frame_type == KEY_FRAME) {
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-    seg->update_map = 0;
-    seg->update_data = 0;
-    cpi->static_mb_pct = 0;
-
-    // Disable segmentation
-    vp9_disable_segmentation((VP9_PTR)cpi);
-
-    // Clear down the segment features.
-    vp9_clearall_segfeatures(seg);
-  } else if (cpi->refresh_alt_ref_frame) {
-    // If this is an alt ref frame
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-    seg->update_map = 0;
-    seg->update_data = 0;
-    cpi->static_mb_pct = 0;
-
-    // Disable segmentation and individual segment features by default
-    vp9_disable_segmentation((VP9_PTR)cpi);
-    vp9_clearall_segfeatures(seg);
-
-    // Scan frames from current to arf frame.
-    // This function re-enables segmentation if appropriate.
-    vp9_update_mbgraph_stats(cpi);
-
-    // If segmentation was enabled set those features needed for the
-    // arf itself.
-    if (seg->enabled) {
-      seg->update_map = 1;
-      seg->update_data = 1;
-
-      qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
-      vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
-      vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
-
-      vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-      vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
-
-      // Where relevant assume segment data is delta data
-      seg->abs_delta = SEGMENT_DELTADATA;
-    }
-  } else if (seg->enabled) {
-    // All other frames if segmentation has been enabled
-
-    // First normal frame in a valid gf or alt ref group
-    if (cpi->frames_since_golden == 0) {
-      // Set up segment features for normal frames in an arf group
-      if (cpi->source_alt_ref_active) {
-        seg->update_map = 0;
-        seg->update_data = 1;
-        seg->abs_delta = SEGMENT_DELTADATA;
-
-        qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q,
-                                      (cpi->avg_q * 1.125));
-        vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
-        vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-
-        vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
-        vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
-
-        // Segment coding disabled for compred testing
-        if (high_q || (cpi->static_mb_pct == 100)) {
-          vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-          vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
-          vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
-        }
-      } else {
-        // Disable segmentation and clear down features if alt ref
-        // is not active for this group
-
-        vp9_disable_segmentation((VP9_PTR)cpi);
-
-        vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-
-        seg->update_map = 0;
-        seg->update_data = 0;
-
-        vp9_clearall_segfeatures(seg);
-      }
-    } else if (cpi->is_src_frame_alt_ref) {
-      // Special case where we are coding over the top of a previous
-      // alt ref frame.
-      // Segment coding disabled for compred testing
-
-      // Enable ref frame features for segment 0 as well
-      vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
-
-      // All mbs should use ALTREF_FRAME
-      vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
-      vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-      vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
-      vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
-
-      // Skip all MBs if high Q (0,0 mv and skip coeffs)
-      if (high_q) {
-        vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
-        vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
-      }
-      // Enable data update
-      seg->update_data = 1;
-    } else {
-      // All other frames.
-
-      // No updates.. leave things as they are.
-      seg->update_map = 0;
-      seg->update_data = 0;
-    }
-  }
-}
-
-#ifdef ENTROPY_STATS
-void vp9_update_mode_context_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int i, j;
-  unsigned int (*inter_mode_counts)[INTER_MODES - 1][2] =
-      cm->fc.inter_mode_counts;
-  int64_t (*mv_ref_stats)[INTER_MODES - 1][2] = cpi->mv_ref_stats;
-  FILE *f;
-
-  // Read the past stats counters
-  f = fopen("mode_context.bin",  "rb");
-  if (!f) {
-    vpx_memset(cpi->mv_ref_stats, 0, sizeof(cpi->mv_ref_stats));
-  } else {
-    fread(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
-    fclose(f);
-  }
-
-  // Add in the values for this frame
-  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-    for (j = 0; j < INTER_MODES - 1; j++) {
-      mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0];
-      mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1];
-    }
-  }
-
-  // Write back the accumulated stats
-  f = fopen("mode_context.bin",  "wb");
-  fwrite(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
-  fclose(f);
-}
-
-void print_mode_context(VP9_COMP *cpi) {
-  FILE *f = fopen("vp9_modecont.c", "a");
-  int i, j;
-
-  fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(
-      f,
-      "const int inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] =");
-  fprintf(f, "{\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    fprintf(f, "  {/* %d */ ", j);
-    fprintf(f, "    ");
-    for (i = 0; i < INTER_MODES - 1; i++) {
-      int this_prob;
-      int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1];
-      if (count)
-        this_prob = ((cpi->mv_ref_stats[j][i][0] * 256) + (count >> 1)) / count;
-      else
-        this_prob = 128;
-
-      // context probs
-      fprintf(f, "%5d, ", this_prob);
-    }
-    fprintf(f, "  },\n");
-  }
-
-  fprintf(f, "};\n");
-  fclose(f);
-}
-#endif  // ENTROPY_STATS
-
-// DEBUG: Print out the segment id of each MB in the current frame.
-static void print_seg_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col;
-  int map_index = 0;
-  FILE *statsfile = fopen("segmap.stt", "a");
-
-  fprintf(statsfile, "%10d\n", cm->current_video_frame);
-
-  for (row = 0; row < cpi->common.mi_rows; row++) {
-    for (col = 0; col < cpi->common.mi_cols; col++) {
-      fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]);
-      map_index++;
-    }
-    fprintf(statsfile, "\n");
-  }
-  fprintf(statsfile, "\n");
-
-  fclose(statsfile);
-}
-
-static void update_reference_segmentation_map(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int row, col;
-  MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible;
-  uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
-
-  for (row = 0; row < cm->mi_rows; row++) {
-    mi_8x8 = mi_8x8_ptr;
-    cache = cache_ptr;
-    for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
-      cache[0] = mi_8x8[0]->mbmi.segment_id;
-    mi_8x8_ptr += cm->mode_info_stride;
-    cache_ptr += cm->mi_cols;
-  }
-}
-
-static void set_default_lf_deltas(struct loopfilter *lf) {
-  lf->mode_ref_delta_enabled = 1;
-  lf->mode_ref_delta_update = 1;
-
-  vp9_zero(lf->ref_deltas);
-  vp9_zero(lf->mode_deltas);
-
-  // Test of ref frame deltas
-  lf->ref_deltas[INTRA_FRAME] = 2;
-  lf->ref_deltas[LAST_FRAME] = 0;
-  lf->ref_deltas[GOLDEN_FRAME] = -2;
-  lf->ref_deltas[ALTREF_FRAME] = -2;
-
-  lf->mode_deltas[0] = 0;   // Zero
-  lf->mode_deltas[1] = 0;   // New mv
-}
-
-static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) {
-  SPEED_FEATURES *sf = &cpi->sf;
-  int i;
-
-  // Set baseline threshold values
-  for (i = 0; i < MAX_MODES; ++i)
-    sf->thresh_mult[i] = mode == 0 ? -500 : 0;
-
-  sf->thresh_mult[THR_NEARESTMV] = 0;
-  sf->thresh_mult[THR_NEARESTG] = 0;
-  sf->thresh_mult[THR_NEARESTA] = 0;
-
-  sf->thresh_mult[THR_DC] += 1000;
-
-  sf->thresh_mult[THR_NEWMV] += 1000;
-  sf->thresh_mult[THR_NEWA] += 1000;
-  sf->thresh_mult[THR_NEWG] += 1000;
-
-  sf->thresh_mult[THR_NEARMV] += 1000;
-  sf->thresh_mult[THR_NEARA] += 1000;
-  sf->thresh_mult[THR_COMP_NEARESTLA] += 1000;
-  sf->thresh_mult[THR_COMP_NEARESTGA] += 1000;
-
-  sf->thresh_mult[THR_TM] += 1000;
-
-  sf->thresh_mult[THR_COMP_NEARLA] += 1500;
-  sf->thresh_mult[THR_COMP_NEWLA] += 2000;
-  sf->thresh_mult[THR_NEARG] += 1000;
-  sf->thresh_mult[THR_COMP_NEARGA] += 1500;
-  sf->thresh_mult[THR_COMP_NEWGA] += 2000;
-
-  sf->thresh_mult[THR_ZEROMV] += 2000;
-  sf->thresh_mult[THR_ZEROG] += 2000;
-  sf->thresh_mult[THR_ZEROA] += 2000;
-  sf->thresh_mult[THR_COMP_ZEROLA] += 2500;
-  sf->thresh_mult[THR_COMP_ZEROGA] += 2500;
-
-  sf->thresh_mult[THR_H_PRED] += 2000;
-  sf->thresh_mult[THR_V_PRED] += 2000;
-  sf->thresh_mult[THR_D45_PRED ] += 2500;
-  sf->thresh_mult[THR_D135_PRED] += 2500;
-  sf->thresh_mult[THR_D117_PRED] += 2500;
-  sf->thresh_mult[THR_D153_PRED] += 2500;
-  sf->thresh_mult[THR_D207_PRED] += 2500;
-  sf->thresh_mult[THR_D63_PRED] += 2500;
-
-  /* disable frame modes if flags not set */
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
-    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-  }
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-  }
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
-  }
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-  }
-}
-
-static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
-  SPEED_FEATURES *sf = &cpi->sf;
-  int i;
-
-  for (i = 0; i < MAX_REFS; ++i)
-    sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0;
-
-  sf->thresh_mult_sub8x8[THR_LAST] += 2500;
-  sf->thresh_mult_sub8x8[THR_GOLD] += 2500;
-  sf->thresh_mult_sub8x8[THR_ALTR] += 2500;
-  sf->thresh_mult_sub8x8[THR_INTRA] += 2500;
-  sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
-  sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
-
-  // Check for masked out split cases.
-  for (i = 0; i < MAX_REFS; i++) {
-    if (sf->disable_split_mask & (1 << i))
-      sf->thresh_mult_sub8x8[i] = INT_MAX;
-  }
-
-  // disable mode test if frame flag is not set
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
-    sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
-    sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
-    sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_LAST_FLAG | VP9_ALT_FLAG))
-    sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
-      (VP9_GOLD_FLAG | VP9_ALT_FLAG))
-    sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
-}
-
-void vp9_set_speed_features(VP9_COMP *cpi) {
-  SPEED_FEATURES *sf = &cpi->sf;
-  int mode = cpi->compressor_speed;
-  int speed = cpi->speed;
-  int i;
-
-  // Only modes 0 and 1 supported for now in experimental code basae
-  if (mode > 1)
-    mode = 1;
-
-  for (i = 0; i < MAX_MODES; ++i)
-    cpi->mode_chosen_counts[i] = 0;
-
-  // best quality defaults
-  sf->RD = 1;
-  sf->search_method = NSTEP;
-  sf->auto_filter = 1;
-  sf->recode_loop = 1;
-  sf->subpel_search_method = SUBPEL_TREE;
-  sf->subpel_iters_per_step = 2;
-  sf->optimize_coefficients = !cpi->oxcf.lossless;
-  sf->reduce_first_step_size = 0;
-  sf->auto_mv_step_size = 0;
-  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
-  sf->adaptive_rd_thresh = 0;
-  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
-  sf->tx_size_search_method = USE_FULL_RD;
-  sf->use_lp32x32fdct = 0;
-  sf->adaptive_motion_search = 0;
-  sf->use_avoid_tested_higherror = 0;
-  sf->reference_masking = 0;
-  sf->use_one_partition_size_always = 0;
-  sf->less_rectangular_check = 0;
-  sf->use_square_partition_only = 0;
-  sf->auto_min_max_partition_size = 0;
-  sf->max_partition_size = BLOCK_64X64;
-  sf->min_partition_size = BLOCK_4X4;
-  sf->adjust_partitioning_from_last_frame = 0;
-  sf->last_partitioning_redo_frequency = 4;
-  sf->disable_split_mask = 0;
-  sf->mode_search_skip_flags = 0;
-  sf->disable_split_var_thresh = 0;
-  sf->disable_filter_search_var_thresh = 0;
-  for (i = 0; i < TX_SIZES; i++) {
-    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
-    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
-  }
-  sf->use_rd_breakout = 0;
-  sf->skip_encode_sb = 0;
-  sf->use_uv_intra_rd_estimate = 0;
-  sf->use_fast_lpf_pick = 0;
-  sf->use_fast_coef_updates = 0;
-  sf->using_small_partition_info = 0;
-  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
-
-#if CONFIG_MULTIPLE_ARF
-  // Switch segmentation off.
-  sf->static_segmentation = 0;
-#else
-  sf->static_segmentation = 0;
-#endif
-
-  sf->variance_adaptive_quantization = 0;
-
-  switch (mode) {
-    case 0:  // This is the best quality mode.
-      break;
-
-    case 1:
-#if CONFIG_MULTIPLE_ARF
-      // Switch segmentation off.
-      sf->static_segmentation = 0;
-#else
-      sf->static_segmentation = 0;
-#endif
-      sf->use_avoid_tested_higherror = 1;
-      sf->adaptive_rd_thresh = 1;
-      sf->recode_loop = (speed < 1);
-
-      if (speed == 1) {
-        sf->use_square_partition_only = !frame_is_intra_only(&cpi->common);
-        sf->less_rectangular_check  = 1;
-        sf->tx_size_search_method = frame_is_intra_only(&cpi->common)
-                                     ? USE_FULL_RD : USE_LARGESTALL;
-
-        if (MIN(cpi->common.width, cpi->common.height) >= 720)
-          sf->disable_split_mask = cpi->common.show_frame ?
-              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-        else
-          sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->auto_mv_step_size = 1;
-        sf->adaptive_rd_thresh = 2;
-        sf->recode_loop = 2;
-        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-      }
-      if (speed == 2) {
-        sf->use_square_partition_only = !frame_is_intra_only(&cpi->common);
-        sf->less_rectangular_check  = 1;
-        sf->tx_size_search_method = frame_is_intra_only(&cpi->common)
-                                     ? USE_FULL_RD : USE_LARGESTALL;
-
-        if (MIN(cpi->common.width, cpi->common.height) >= 720)
-          sf->disable_split_mask = cpi->common.show_frame ?
-              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-        else
-          sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_INTRA_LOWVAR;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 16;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->adaptive_rd_thresh = 2;
-        sf->recode_loop = 2;
-        sf->mode_skip_start = 11;
-        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
-      }
-      if (speed == 3) {
-        sf->use_square_partition_only = 1;
-        sf->tx_size_search_method = USE_LARGESTALL;
-
-        if (MIN(cpi->common.width, cpi->common.height) >= 720)
-          sf->disable_split_mask = DISABLE_ALL_SPLIT;
-        else
-          sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_INTRA_LOWVAR;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 16;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->use_uv_intra_rd_estimate = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_coef_updates = 2;
-
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-      }
-      if (speed == 4) {
-        sf->use_square_partition_only = 1;
-        sf->tx_size_search_method = USE_LARGESTALL;
-        sf->disable_split_mask = DISABLE_ALL_SPLIT;
-
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-
-        sf->use_rd_breakout = 1;
-        sf->adaptive_motion_search = 1;
-        sf->auto_mv_step_size = 1;
-
-        sf->disable_filter_search_var_thresh = 16;
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
-        sf->auto_min_max_partition_size = 1;
-        sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
-        sf->adjust_partitioning_from_last_frame = 1;
-        sf->last_partitioning_redo_frequency = 3;
-
-        sf->use_uv_intra_rd_estimate = 1;
-        sf->skip_encode_sb = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->subpel_iters_per_step = 1;
-        sf->use_fast_coef_updates = 2;
-
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-
-        /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
-        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
-        sf->search_method = BIGDIA;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 64; */
-      }
-      if (speed == 5) {
-        sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-        sf->use_one_partition_size_always = 1;
-        sf->always_this_block_size = BLOCK_16X16;
-        sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ?
-                                     USE_FULL_RD : USE_LARGESTALL;
-        sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
-                                     FLAG_SKIP_INTRA_BESTINTER |
-                                     FLAG_SKIP_COMP_BESTINTRA |
-                                     FLAG_SKIP_COMP_REFMISMATCH |
-                                     FLAG_SKIP_INTRA_LOWVAR |
-                                     FLAG_EARLY_TERMINATE;
-        sf->use_rd_breakout = 1;
-        sf->use_lp32x32fdct = 1;
-        sf->optimize_coefficients = 0;
-        sf->auto_mv_step_size = 1;
-        // sf->reduce_first_step_size = 1;
-        // sf->reference_masking = 1;
-
-        sf->disable_split_mask = DISABLE_ALL_SPLIT;
-        sf->search_method = HEX;
-        sf->subpel_iters_per_step = 1;
-        sf->disable_split_var_thresh = 64;
-        sf->disable_filter_search_var_thresh = 96;
-        for (i = 0; i < TX_SIZES; i++) {
-          sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
-          sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
-        }
-        sf->use_fast_coef_updates = 2;
-        sf->adaptive_rd_thresh = 4;
-        sf->mode_skip_start = 6;
-      }
-      break;
-  }; /* switch */
-
-  // Set rd thresholds based on mode and speed setting
-  set_rd_speed_thresholds(cpi, mode);
-  set_rd_speed_thresholds_sub8x8(cpi, mode);
-
-  // Slow quant, dct and trellis not worthwhile for first pass
-  // so make sure they are always turned off.
-  if (cpi->pass == 1) {
-    sf->optimize_coefficients = 0;
-  }
-
-  // No recode for 1 pass.
-  if (cpi->pass == 0) {
-    sf->recode_loop = 0;
-    sf->optimize_coefficients = 0;
-  }
-
-  cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
-  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
-  }
-
-  if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative;
-    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative;
-  } else if (cpi->sf.subpel_search_method == SUBPEL_TREE) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
-    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
-  }
-
-  cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
-
-#ifdef SPEEDSTATS
-  frames_at_speed[cpi->speed]++;
-#endif
-}
-
-static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
-                                      cm->subsampling_x, cm->subsampling_y,
-                                      cpi->oxcf.lag_in_frames);
-  if (!cpi->lookahead)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate lag buffers");
-
-  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
-                               cpi->oxcf.width, cpi->oxcf.height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
-}
-
-void vp9_alloc_compressor_data(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (vp9_alloc_frame_buffers(cm, cm->width, cm->height))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate frame buffers");
-
-  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
-                             cm->width, cm->height,
-                             cm->subsampling_x, cm->subsampling_y,
-                             VP9BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame buffer");
-
-  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
-                             cm->width, cm->height,
-                             cm->subsampling_x, cm->subsampling_y,
-                             VP9BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled source buffer");
-
-  vpx_free(cpi->tok);
-
-  {
-    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
-
-    CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
-  }
-
-  // Data used for real time vc mode to see if gf needs refreshing
-  cpi->inter_zz_count = 0;
-  cpi->gf_bad_count = 0;
-  cpi->gf_update_recommended = 0;
-
-  vpx_free(cpi->mb_activity_map);
-  CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
-                  vpx_calloc(sizeof(unsigned int),
-                             cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->mb_norm_activity_map);
-  CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
-                  vpx_calloc(sizeof(unsigned int),
-                             cm->mb_rows * cm->mb_cols));
-
-  // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
-  // block where mi unit size is 8x8.
-  vpx_free(cpi->above_context[0]);
-  CHECK_MEM_ERROR(cm, cpi->above_context[0],
-                  vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) *
-                             MAX_MB_PLANE,
-                             sizeof(*cpi->above_context[0])));
-
-  vpx_free(cpi->above_seg_context);
-  CHECK_MEM_ERROR(cm, cpi->above_seg_context,
-                  vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols),
-                             sizeof(*cpi->above_seg_context)));
-}
-
-
-static void update_frame_size(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  vp9_update_frame_size(cm);
-
-  // Update size of buffers local to this frame
-  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to reallocate last frame buffer");
-
-  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to reallocate scaled source buffer");
-
-  {
-    int y_stride = cpi->scaled_source.y_stride;
-
-    if (cpi->sf.search_method == NSTEP) {
-      vp9_init3smotion_compensation(&cpi->mb, y_stride);
-    } else if (cpi->sf.search_method == DIAMOND) {
-      vp9_init_dsmotion_compensation(&cpi->mb, y_stride);
-    }
-  }
-
-  {
-    int i;
-    for (i = 1; i < MAX_MB_PLANE; ++i) {
-      cpi->above_context[i] = cpi->above_context[0] +
-                              i * sizeof(*cpi->above_context[0]) * 2 *
-                              mi_cols_aligned_to_sb(cm->mi_cols);
-    }
-  }
-}
-
-
-// Table that converts 0-63 Q range values passed in outside to the Qindex
-// range used internally.
-static const int q_trans[] = {
-  0,    4,   8,  12,  16,  20,  24,  28,
-  32,   36,  40,  44,  48,  52,  56,  60,
-  64,   68,  72,  76,  80,  84,  88,  92,
-  96,  100, 104, 108, 112, 116, 120, 124,
-  128, 132, 136, 140, 144, 148, 152, 156,
-  160, 164, 168, 172, 176, 180, 184, 188,
-  192, 196, 200, 204, 208, 212, 216, 220,
-  224, 228, 232, 236, 240, 244, 249, 255,
-};
-
-int vp9_reverse_trans(int x) {
-  int i;
-
-  for (i = 0; i < 64; i++)
-    if (q_trans[i] >= x)
-      return i;
-
-  return 63;
-};
-void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
-  if (framerate < 0.1)
-    framerate = 30;
-
-  cpi->oxcf.framerate = framerate;
-  cpi->output_framerate = cpi->oxcf.framerate;
-  cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
-                             / cpi->output_framerate);
-  cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
-                                / cpi->output_framerate);
-  cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
-                                   cpi->oxcf.two_pass_vbrmin_section / 100);
-
-
-  cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
-
-  // Set Maximum gf/arf interval
-  cpi->max_gf_interval = 16;
-
-  // Extended interval for genuinely static scenes
-  cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
-
-  // Special conditions when alt ref frame enabled in lagged compress mode
-  if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
-    if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
-      cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-
-    if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
-      cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-  }
-
-  if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
-    cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
-}
-
-static int64_t rescale(int val, int64_t num, int denom) {
-  int64_t llnum = num;
-  int64_t llden = denom;
-  int64_t llval = val;
-
-  return (llval * llnum / llden);
-}
-
-static void set_tile_limits(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  int min_log2_tile_cols, max_log2_tile_cols;
-  vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
-
-  cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
-                             min_log2_tile_cols, max_log2_tile_cols);
-  cm->log2_tile_rows = cpi->oxcf.tile_rows;
-}
-
-static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *const cm = &cpi->common;
-  int i;
-
-  cpi->oxcf = *oxcf;
-  cpi->goldfreq = 7;
-
-  cm->version = oxcf->version;
-
-  cm->width = oxcf->width;
-  cm->height = oxcf->height;
-  cm->subsampling_x = 0;
-  cm->subsampling_y = 0;
-  vp9_alloc_compressor_data(cpi);
-
-  // change includes all joint functionality
-  vp9_change_config(ptr, oxcf);
-
-  // Initialize active best and worst q and average q values.
-  cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-  cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-  cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
-
-  // Initialise the starting buffer levels
-  cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
-  cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
-
-  cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
-  cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-  cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
-  cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
-
-  cpi->total_actual_bits            = 0;
-  cpi->total_target_vs_actual       = 0;
-
-  cpi->static_mb_pct = 0;
-
-  cpi->lst_fb_idx = 0;
-  cpi->gld_fb_idx = 1;
-  cpi->alt_fb_idx = 2;
-
-  cpi->current_layer = 0;
-  cpi->use_svc = 0;
-
-  set_tile_limits(cpi);
-
-  cpi->fixed_divide[0] = 0;
-  for (i = 1; i < 512; i++)
-    cpi->fixed_divide[i] = 0x80000 / i;
-}
-
-
-void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *const cm = &cpi->common;
-
-  if (!cpi || !oxcf)
-    return;
-
-  if (cm->version != oxcf->version) {
-    cm->version = oxcf->version;
-  }
-
-  cpi->oxcf = *oxcf;
-
-  switch (cpi->oxcf.Mode) {
-      // Real time and one pass deprecated in test code base
-    case MODE_GOODQUALITY:
-      cpi->pass = 0;
-      cpi->compressor_speed = 2;
-      cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
-      break;
-
-    case MODE_FIRSTPASS:
-      cpi->pass = 1;
-      cpi->compressor_speed = 1;
-      break;
-
-    case MODE_SECONDPASS:
-      cpi->pass = 2;
-      cpi->compressor_speed = 1;
-      cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
-      break;
-
-    case MODE_SECONDPASS_BEST:
-      cpi->pass = 2;
-      cpi->compressor_speed = 0;
-      break;
-  }
-
-  cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-  cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
-  cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-  cpi->oxcf.lossless = oxcf->lossless;
-  cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
-                                              : vp9_idct4x4_add;
-  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  // cpi->use_golden_frame_only = 0;
-  // cpi->use_last_frame_only = 0;
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_last_frame = 1;
-  cm->refresh_frame_context = 1;
-  cm->reset_frame_context = 0;
-
-  setup_features(cm);
-  cpi->common.allow_high_precision_mv = 0;  // Default mv precision
-  set_mvcost(cpi);
-
-  {
-    int i;
-
-    for (i = 0; i < MAX_SEGMENTS; i++)
-      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
-  }
-
-  // At the moment the first order values may not be > MAXQ
-  cpi->oxcf.fixed_q = MIN(cpi->oxcf.fixed_q, MAXQ);
-
-  // local file playback mode == really big buffer
-  if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
-    cpi->oxcf.starting_buffer_level   = 60000;
-    cpi->oxcf.optimal_buffer_level    = 60000;
-    cpi->oxcf.maximum_buffer_size     = 240000;
-  }
-
-  // Convert target bandwidth from Kbit/s to Bit/s
-  cpi->oxcf.target_bandwidth       *= 1000;
-
-  cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level,
-                                            cpi->oxcf.target_bandwidth, 1000);
-
-  // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level == 0)
-    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-  else
-    cpi->oxcf.optimal_buffer_level = rescale(cpi->oxcf.optimal_buffer_level,
-                                             cpi->oxcf.target_bandwidth, 1000);
-
-  if (cpi->oxcf.maximum_buffer_size == 0)
-    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-  else
-    cpi->oxcf.maximum_buffer_size = rescale(cpi->oxcf.maximum_buffer_size,
-                                            cpi->oxcf.target_bandwidth, 1000);
-
-  // Set up frame rate and related parameters rate control values.
-  vp9_new_framerate(cpi, cpi->oxcf.framerate);
-
-  // Set absolute upper and lower quality limits
-  cpi->worst_quality = cpi->oxcf.worst_allowed_q;
-  cpi->best_quality = cpi->oxcf.best_allowed_q;
-
-  // active values should only be modified if out of new range
-  cpi->active_worst_quality = clamp(cpi->active_worst_quality,
-                                    cpi->oxcf.best_allowed_q,
-                                    cpi->oxcf.worst_allowed_q);
-
-  cpi->active_best_quality = clamp(cpi->active_best_quality,
-                                   cpi->oxcf.best_allowed_q,
-                                   cpi->oxcf.worst_allowed_q);
-
-  cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
-
-  cpi->cq_target_quality = cpi->oxcf.cq_level;
-
-  cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-
-  cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
-  cm->display_width = cpi->oxcf.width;
-  cm->display_height = cpi->oxcf.height;
-
-  // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-  cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness);
-
-  cpi->common.lf.sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cpi->initial_width) {
-    // Increasing the size of the frame beyond the first seen frame, or some
-    // otherwise signalled maximum size, is not supported.
-    // TODO(jkoleszar): exit gracefully.
-    assert(cm->width <= cpi->initial_width);
-    assert(cm->height <= cpi->initial_height);
-  }
-  update_frame_size(cpi);
-
-  if (cpi->oxcf.fixed_q >= 0) {
-    cpi->last_q[0] = cpi->oxcf.fixed_q;
-    cpi->last_q[1] = cpi->oxcf.fixed_q;
-    cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
-  }
-
-  cpi->speed = cpi->oxcf.cpu_used;
-
-  if (cpi->oxcf.lag_in_frames == 0) {
-    // force to allowlag to 0 if lag_in_frames is 0;
-    cpi->oxcf.allow_lag = 0;
-  } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) {
-     // Limit on lag buffers as these are not currently dynamically allocated
-    cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-  }
-
-  // YX Temp
-#if CONFIG_MULTIPLE_ARF
-  vp9_zero(cpi->alt_ref_source);
-#else
-  cpi->alt_ref_source = NULL;
-#endif
-  cpi->is_src_frame_alt_ref = 0;
-
-#if 0
-  // Experimental RD Code
-  cpi->frame_distortion = 0;
-  cpi->last_frame_distortion = 0;
-#endif
-
-  set_tile_limits(cpi);
-}
-
-#define M_LOG2_E 0.693147180559945309417
-#define log2f(x) (log (x) / (float) M_LOG2_E)
-
-static void cal_nmvjointsadcost(int *mvjointsadcost) {
-  mvjointsadcost[0] = 600;
-  mvjointsadcost[1] = 300;
-  mvjointsadcost[2] = 300;
-  mvjointsadcost[0] = 300;
-}
-
-static void cal_nmvsadcosts(int *mvsadcost[2]) {
-  int i = 1;
-
-  mvsadcost[0][0] = 0;
-  mvsadcost[1][0] = 0;
-
-  do {
-    double z = 256 * (2 * (log2f(8 * i) + .6));
-    mvsadcost[0][i] = (int)z;
-    mvsadcost[1][i] = (int)z;
-    mvsadcost[0][-i] = (int)z;
-    mvsadcost[1][-i] = (int)z;
-  } while (++i <= MV_MAX);
-}
-
-static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
-  int i = 1;
-
-  mvsadcost[0][0] = 0;
-  mvsadcost[1][0] = 0;
-
-  do {
-    double z = 256 * (2 * (log2f(8 * i) + .6));
-    mvsadcost[0][i] = (int)z;
-    mvsadcost[1][i] = (int)z;
-    mvsadcost[0][-i] = (int)z;
-    mvsadcost[1][-i] = (int)z;
-  } while (++i <= MV_MAX);
-}
-
-static void init_pick_mode_context(VP9_COMP *cpi) {
-  int i;
-  MACROBLOCK  *x  = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON  *cm = &cpi->common;
-
-  for (i = 0; i < BLOCK_SIZES; ++i) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[i];
-    const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
-    if (i < BLOCK_16X16) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
-          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
-            PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-            ctx->num_4x4_blk = num_4x4_blk;
-            CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                            vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
-          }
-        }
-      }
-    } else if (i < BLOCK_32X32) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
-                               ++xd->mb_index) {
-          PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-          ctx->num_4x4_blk = num_4x4_blk;
-          CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                          vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
-        }
-      }
-    } else if (i < BLOCK_64X64) {
-      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
-        PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-        ctx->num_4x4_blk = num_4x4_blk;
-        CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                        vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
-      }
-    } else {
-      PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-      ctx->num_4x4_blk = num_4x4_blk;
-      CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                      vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
-    }
-  }
-}
-
-static void free_pick_mode_context(MACROBLOCK *x) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  for (i = 0; i < BLOCK_SIZES; ++i) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[i];
-    const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
-    if (i < BLOCK_16X16) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
-          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
-            PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-            vpx_free(ctx->zcoeff_blk);
-            ctx->zcoeff_blk = 0;
-          }
-        }
-      }
-    } else if (i < BLOCK_32X32) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
-                               ++xd->mb_index) {
-          PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-          vpx_free(ctx->zcoeff_blk);
-          ctx->zcoeff_blk = 0;
-        }
-      }
-    } else if (i < BLOCK_64X64) {
-      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
-        PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-        vpx_free(ctx->zcoeff_blk);
-        ctx->zcoeff_blk = 0;
-      }
-    } else {
-      PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-      vpx_free(ctx->zcoeff_blk);
-      ctx->zcoeff_blk = 0;
-    }
-  }
-}
-
-VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
-  int i, j;
-  volatile union {
-    VP9_COMP *cpi;
-    VP9_PTR   ptr;
-  } ctx;
-
-  VP9_COMP *cpi;
-  VP9_COMMON *cm;
-
-  cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
-  // Check that the CPI instance is valid
-  if (!cpi)
-    return 0;
-
-  cm = &cpi->common;
-
-  vp9_zero(*cpi);
-
-  if (setjmp(cm->error.jmp)) {
-    VP9_PTR ptr = ctx.ptr;
-
-    ctx.cpi->common.error.setjmp = 0;
-    vp9_remove_compressor(&ptr);
-    return 0;
-  }
-
-  cm->error.setjmp = 1;
-
-  CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site),
-                                             (MAX_MVSEARCH_STEPS * 8) + 1));
-
-  vp9_create_common(cm);
-
-  init_config((VP9_PTR)cpi, oxcf);
-
-  init_pick_mode_context(cpi);
-
-  cm->current_video_frame   = 0;
-  cpi->kf_overspend_bits            = 0;
-  cpi->kf_bitrate_adjustment        = 0;
-  cpi->frames_till_gf_update_due    = 0;
-  cpi->gf_overspend_bits            = 0;
-  cpi->non_gf_bitrate_adjustment    = 0;
-
-  // Set reference frame sign bias for ALTREF frame to 1 (for now)
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
-
-  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
-  cpi->gold_is_last = 0;
-  cpi->alt_is_last  = 0;
-  cpi->gold_is_alt  = 0;
-
-  // Spatial scalability
-  cpi->number_spatial_layers = oxcf->ss_number_layers;
-
-  // Create the encoder segmentation map and set all entries to 0
-  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
-                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
-  // And a place holder structure is the coding context
-  // for use if we want to save and restore it
-  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
-                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
-  CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1));
-  vpx_memset(cpi->active_map, 1, cm->MBs);
-  cpi->active_map_enabled = 0;
-
-  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
-                   sizeof(cpi->mbgraph_stats[0])); i++) {
-    CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
-                    vpx_calloc(cm->MBs *
-                               sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
-  }
-
-#ifdef ENTROPY_STATS
-  if (cpi->pass != 1)
-    init_context_counters();
-#endif
-
-#ifdef MODE_STATS
-  init_tx_count_stats();
-  init_switchable_interp_stats();
-#endif
-
-  /*Initialize the feed-forward activity masking.*/
-  cpi->activity_avg = 90 << 12;
-
-  cpi->frames_since_key = 8;  // Sensible default for first frame.
-  cpi->key_frame_frequency = cpi->oxcf.key_freq;
-  cpi->this_key_frame_forced = 0;
-  cpi->next_key_frame_forced = 0;
-
-  cpi->source_alt_ref_pending = 0;
-  cpi->source_alt_ref_active = 0;
-  cpi->refresh_alt_ref_frame = 0;
-
-#if CONFIG_MULTIPLE_ARF
-  // Turn multiple ARF usage on/off. This is a quick hack for the initial test
-  // version. It should eventually be set via the codec API.
-  cpi->multi_arf_enabled = 1;
-
-  if (cpi->multi_arf_enabled) {
-    cpi->sequence_number = 0;
-    cpi->frame_coding_order_period = 0;
-    vp9_zero(cpi->frame_coding_order);
-    vp9_zero(cpi->arf_buffer_idx);
-  }
-#endif
-
-  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
-#if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_ssimg = 0;
-
-  cpi->count = 0;
-  cpi->bytes = 0;
-
-  if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error = 0.0;
-    cpi->total_sq_error2 = 0.0;
-    cpi->total_y = 0.0;
-    cpi->total_u = 0.0;
-    cpi->total_v = 0.0;
-    cpi->total = 0.0;
-    cpi->totalp_y = 0.0;
-    cpi->totalp_u = 0.0;
-    cpi->totalp_v = 0.0;
-    cpi->totalp = 0.0;
-    cpi->tot_recode_hits = 0;
-    cpi->summed_quality = 0;
-    cpi->summed_weights = 0;
-    cpi->summedp_quality = 0;
-    cpi->summedp_weights = 0;
-  }
-
-  if (cpi->b_calculate_ssimg) {
-    cpi->total_ssimg_y = 0;
-    cpi->total_ssimg_u = 0;
-    cpi->total_ssimg_v = 0;
-    cpi->total_ssimg_all = 0;
-  }
-
-#endif
-
-  cpi->first_time_stamp_ever = INT64_MAX;
-
-  cpi->frames_till_gf_update_due      = 0;
-  cpi->key_frame_count              = 1;
-
-  cpi->ni_av_qi                     = cpi->oxcf.worst_allowed_q;
-  cpi->ni_tot_qi                    = 0;
-  cpi->ni_frames                   = 0;
-  cpi->tot_q = 0.0;
-  cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
-  cpi->total_byte_count             = 0;
-
-  cpi->rate_correction_factor         = 1.0;
-  cpi->key_frame_rate_correction_factor = 1.0;
-  cpi->gf_rate_correction_factor  = 1.0;
-  cpi->twopass.est_max_qcorrection_factor  = 1.0;
-
-  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
-  cal_nmvsadcosts(cpi->mb.nmvsadcost);
-
-  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
-  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
-
-  for (i = 0; i < KEY_FRAME_CONTEXT; i++)
-    cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
-
-#ifdef OUTPUT_YUV_SRC
-  yuv_file = fopen("bd.yuv", "ab");
-#endif
-#ifdef OUTPUT_YUV_REC
-  yuv_rec_file = fopen("rec.yuv", "wb");
-#endif
-
-#if 0
-  framepsnr = fopen("framepsnr.stt", "a");
-  kf_list = fopen("kf_list.stt", "w");
-#endif
-
-  cpi->output_pkt_list = oxcf->output_pkt_list;
-
-  cpi->enable_encode_breakout = 1;
-
-  if (cpi->pass == 1) {
-    vp9_init_first_pass(cpi);
-  } else if (cpi->pass == 2) {
-    size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
-
-    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
-    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
-    cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
-                                         + (packets - 1) * packet_sz);
-    vp9_init_second_pass(cpi);
-  }
-
-  vp9_set_speed_features(cpi);
-
-  // Default rd threshold factors for mode selection
-  for (i = 0; i < BLOCK_SIZES; ++i) {
-    for (j = 0; j < MAX_MODES; ++j)
-      cpi->rd_thresh_freq_fact[i][j] = 32;
-    for (j = 0; j < MAX_REFS; ++j)
-      cpi->rd_thresh_freq_sub8x8[i][j] = 32;
-  }
-
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
-            SDX3F, SDX8F, SDX4DF)\
-    cpi->fn_ptr[BT].sdf            = SDF; \
-    cpi->fn_ptr[BT].sdaf           = SDAF; \
-    cpi->fn_ptr[BT].vf             = VF; \
-    cpi->fn_ptr[BT].svf            = SVF; \
-    cpi->fn_ptr[BT].svaf           = SVAF; \
-    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
-    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
-    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
-    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
-    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
-    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
-
-  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
-      vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad32x16x4d)
-
-  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
-      vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad16x32x4d)
-
-  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
-      vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad64x32x4d)
-
-  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
-      vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad32x64x4d)
-
-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
-      vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
-      vp9_variance_halfpixvar32x32_v,
-      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
-      vp9_sad32x32x4d)
-
-  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
-      vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
-      vp9_variance_halfpixvar64x64_v,
-      vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
-      vp9_sad64x64x4d)
-
-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
-      vp9_variance16x16, vp9_sub_pixel_variance16x16,
-      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
-      vp9_variance_halfpixvar16x16_v,
-      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-      vp9_sad16x16x4d)
-
-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
-      vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
-      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
-
-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
-      vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
-      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
-
-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
-      vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
-      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
-
-  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
-      vp9_variance8x4, vp9_sub_pixel_variance8x4,
-      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
-      NULL, NULL, vp9_sad8x4x8,
-      vp9_sad8x4x4d)
-
-  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
-      vp9_variance4x8, vp9_sub_pixel_variance4x8,
-      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
-      NULL, NULL, vp9_sad4x8x8,
-      vp9_sad4x8x4d)
-
-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
-      vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
-      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
-
-  cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = vp9_diamond_search_sad;
-  cpi->refining_search_sad = vp9_refining_search_sad;
-
-  // make sure frame 1 is okay
-  cpi->error_bins[0] = cpi->common.MBs;
-
-  /* vp9_init_quantizer() is first called here. Add check in
-   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
-   * called later when needed. This will avoid unnecessary calls of
-   * vp9_init_quantizer() for every frame.
-   */
-  vp9_init_quantizer(cpi);
-
-  vp9_loop_filter_init(cm);
-
-  cpi->common.error.setjmp = 0;
-
-  vp9_zero(cpi->y_uv_mode_count);
-
-#ifdef MODE_TEST_HIT_STATS
-  vp9_zero(cpi->mode_test_hits);
-#endif
-
-  return (VP9_PTR) cpi;
-}
-
-void vp9_remove_compressor(VP9_PTR *ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(*ptr);
-  int i;
-
-  if (!cpi)
-    return;
-
-  if (cpi && (cpi->common.current_video_frame > 0)) {
-    if (cpi->pass == 2) {
-      vp9_end_second_pass(cpi);
-    }
-
-#ifdef ENTROPY_STATS
-    if (cpi->pass != 1) {
-      print_context_counters();
-      print_tree_update_probs();
-      print_mode_context(cpi);
-    }
-#endif
-
-#ifdef MODE_STATS
-    if (cpi->pass != 1) {
-      write_tx_count_stats();
-      write_switchable_interp_stats();
-    }
-#endif
-
-#if CONFIG_INTERNAL_STATS
-
-    vp9_clear_system_state();
-
-    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
-    if (cpi->pass != 1) {
-      FILE *f = fopen("opsnr.stt", "a");
-      double time_encoded = (cpi->last_end_time_stamp_seen
-                             - cpi->first_time_stamp_ever) / 10000000.000;
-      double total_encode_time = (cpi->time_receive_data +
-                                  cpi->time_compress_data)   / 1000.000;
-      double dr = (double)cpi->bytes * (double) 8 / (double)1000
-                  / time_encoded;
-
-      if (cpi->b_calculate_psnr) {
-        YV12_BUFFER_CONFIG *lst_yv12 =
-            &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];
-        double samples = 3.0 / 2 * cpi->count *
-                         lst_yv12->y_width * lst_yv12->y_height;
-        double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
-        double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
-        double total_ssim = 100 * pow(cpi->summed_quality /
-                                      cpi->summed_weights, 8.0);
-        double total_ssimp = 100 * pow(cpi->summedp_quality /
-                                       cpi->summedp_weights, 8.0);
-
-        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
-                "VPXSSIM\tVPSSIMP\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
-                dr, cpi->total / cpi->count, total_psnr,
-                cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
-                total_encode_time);
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
-                cpi->total_ssimg_y / cpi->count,
-                cpi->total_ssimg_u / cpi->count,
-                cpi->total_ssimg_v / cpi->count,
-                cpi->total_ssimg_all / cpi->count, total_encode_time);
-      }
-
-      fclose(f);
-    }
-
-#endif
-
-#ifdef MODE_TEST_HIT_STATS
-    if (cpi->pass != 1) {
-      double norm_per_pixel_mode_tests = 0;
-      double norm_counts[BLOCK_SIZES];
-      int i;
-      int sb64_per_frame;
-      int norm_factors[BLOCK_SIZES] =
-        {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1};
-      FILE *f = fopen("mode_hit_stats.stt", "a");
-
-      // On average, how many mode tests do we do
-      for (i = 0; i < BLOCK_SIZES; ++i) {
-        norm_counts[i] = (double)cpi->mode_test_hits[i] /
-                         (double)norm_factors[i];
-        norm_per_pixel_mode_tests += norm_counts[i];
-      }
-      // Convert to a number per 64x64 and per frame
-      sb64_per_frame = ((cpi->common.height + 63) / 64) *
-                       ((cpi->common.width + 63) / 64);
-      norm_per_pixel_mode_tests =
-        norm_per_pixel_mode_tests /
-        (double)(cpi->common.current_video_frame * sb64_per_frame);
-
-      fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests);
-      fclose(f);
-    }
-#endif
-
-#ifdef ENTROPY_STATS
-    {
-      int i, j, k;
-      FILE *fmode = fopen("vp9_modecontext.c", "w");
-
-      fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
-      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[INTRA_MODES][INTRA_MODES]"
-                     "[INTRA_MODES] =\n{\n");
-
-      for (i = 0; i < INTRA_MODES; i++) {
-        fprintf(fmode, "    { // Above Mode :  %d\n", i);
-
-        for (j = 0; j < INTRA_MODES; j++) {
-          fprintf(fmode, "        {");
-
-          for (k = 0; k < INTRA_MODES; k++) {
-            if (!intra_mode_stats[i][j][k])
-              fprintf(fmode, " %5d, ", 1);
-            else
-              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
-          }
-
-          fprintf(fmode, "}, // left_mode %d\n", j);
-        }
-
-        fprintf(fmode, "    },\n");
-      }
-
-      fprintf(fmode, "};\n");
-      fclose(fmode);
-    }
-#endif
-
-
-#if defined(SECTIONBITS_OUTPUT)
-
-    if (0) {
-      int i;
-      FILE *f = fopen("tokenbits.stt", "a");
-
-      for (i = 0; i < 28; i++)
-        fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
-
-      fprintf(f, "\n");
-      fclose(f);
-    }
-
-#endif
-
-#if 0
-    {
-      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
-      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
-             cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
-             cpi->time_compress_data / 1000,
-             (cpi->time_receive_data + cpi->time_compress_data) / 1000);
-    }
-#endif
-  }
-
-  free_pick_mode_context(&cpi->mb);
-  dealloc_compressor_data(cpi);
-  vpx_free(cpi->mb.ss);
-  vpx_free(cpi->tok);
-
-  for (i = 0; i < sizeof(cpi->mbgraph_stats) /
-                  sizeof(cpi->mbgraph_stats[0]); ++i) {
-    vpx_free(cpi->mbgraph_stats[i].mb_stats);
-  }
-
-  vp9_remove_common(&cpi->common);
-  vpx_free(cpi);
-  *ptr = 0;
-
-#ifdef OUTPUT_YUV_SRC
-  fclose(yuv_file);
-#endif
-#ifdef OUTPUT_YUV_REC
-  fclose(yuv_rec_file);
-#endif
-
-#if 0
-
-  if (keyfile)
-    fclose(keyfile);
-
-  if (framepsnr)
-    fclose(framepsnr);
-
-  if (kf_list)
-    fclose(kf_list);
-
-#endif
-}
-
-
-static uint64_t calc_plane_error(uint8_t *orig, int orig_stride,
-                                 uint8_t *recon, int recon_stride,
-                                 unsigned int cols, unsigned int rows) {
-  unsigned int row, col;
-  uint64_t total_sse = 0;
-  int diff;
-
-  for (row = 0; row + 16 <= rows; row += 16) {
-    for (col = 0; col + 16 <= cols; col += 16) {
-      unsigned int sse;
-
-      vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
-      total_sse += sse;
-    }
-
-    /* Handle odd-sized width */
-    if (col < cols) {
-      unsigned int border_row, border_col;
-      uint8_t *border_orig = orig;
-      uint8_t *border_recon = recon;
-
-      for (border_row = 0; border_row < 16; border_row++) {
-        for (border_col = col; border_col < cols; border_col++) {
-          diff = border_orig[border_col] - border_recon[border_col];
-          total_sse += diff * diff;
-        }
-
-        border_orig += orig_stride;
-        border_recon += recon_stride;
-      }
-    }
-
-    orig += orig_stride * 16;
-    recon += recon_stride * 16;
-  }
-
-  /* Handle odd-sized height */
-  for (; row < rows; row++) {
-    for (col = 0; col < cols; col++) {
-      diff = orig[col] - recon[col];
-      total_sse += diff * diff;
-    }
-
-    orig += orig_stride;
-    recon += recon_stride;
-  }
-
-  return total_sse;
-}
-
-
-static void generate_psnr_packet(VP9_COMP *cpi) {
-  YV12_BUFFER_CONFIG      *orig = cpi->Source;
-  YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-  struct vpx_codec_cx_pkt  pkt;
-  uint64_t                 sse;
-  int                      i;
-  unsigned int             width = orig->y_crop_width;
-  unsigned int             height = orig->y_crop_height;
-
-  pkt.kind = VPX_CODEC_PSNR_PKT;
-  sse = calc_plane_error(orig->y_buffer, orig->y_stride,
-                         recon->y_buffer, recon->y_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] = sse;
-  pkt.data.psnr.sse[1] = sse;
-  pkt.data.psnr.samples[0] = width * height;
-  pkt.data.psnr.samples[1] = width * height;
-
-  width = orig->uv_crop_width;
-  height = orig->uv_crop_height;
-
-  sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                         recon->u_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[2] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[2] = width * height;
-
-  sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                         recon->v_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[3] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[3] = width * height;
-
-  for (i = 0; i < 4; i++)
-    pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                         (double)pkt.data.psnr.sse[i]);
-
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
-}
-
-
-int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  if (ref_frame_flags > 7)
-    return -1;
-
-  cpi->ref_frame_flags = ref_frame_flags;
-  return 0;
-}
-int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  if (ref_frame_flags > 7)
-    return -1;
-
-  cpi->refresh_golden_frame = 0;
-  cpi->refresh_alt_ref_frame = 0;
-  cpi->refresh_last_frame   = 0;
-
-  if (ref_frame_flags & VP9_LAST_FLAG)
-    cpi->refresh_last_frame = 1;
-
-  if (ref_frame_flags & VP9_GOLD_FLAG)
-    cpi->refresh_golden_frame = 1;
-
-  if (ref_frame_flags & VP9_ALT_FLAG)
-    cpi->refresh_alt_ref_frame = 1;
-
-  return 0;
-}
-
-int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                           YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
-  else
-    return -1;
-
-  vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
-
-  return 0;
-}
-
-int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  if (index < 0 || index >= NUM_REF_FRAMES)
-    return -1;
-
-  *fb = &cm->yv12_fb[cm->ref_frame_map[index]];
-  return 0;
-}
-
-int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
-  else
-    return -1;
-
-  vp8_yv12_copy_frame(sd, &cm->yv12_fb[ref_fb_idx]);
-
-  return 0;
-}
-int vp9_update_entropy(VP9_PTR comp, int update) {
-  ((VP9_COMP *)comp)->common.refresh_frame_context = update;
-  return 0;
-}
-
-
-#ifdef OUTPUT_YUV_SRC
-void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
-  uint8_t *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
-#ifdef OUTPUT_YUV_REC
-void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
-  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
-  uint8_t *src = s->y_buffer;
-  int h = cm->height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_rec_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_rec_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_rec_file);
-    src += s->uv_stride;
-  } while (--h);
-
-#if CONFIG_ALPHA
-  if (s->alpha_buffer) {
-    src = s->alpha_buffer;
-    h = s->alpha_height;
-    do {
-      fwrite(src, s->alpha_width, 1,  yuv_rec_file);
-      src += s->alpha_stride;
-    } while (--h);
-  }
-#endif
-
-  fflush(yuv_rec_file);
-}
-#endif
-
-static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
-                                   YV12_BUFFER_CONFIG *dst_fb) {
-  const int in_w = src_fb->y_crop_width;
-  const int in_h = src_fb->y_crop_height;
-  const int out_w = dst_fb->y_crop_width;
-  const int out_h = dst_fb->y_crop_height;
-  int x, y, i;
-
-  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
-                      src_fb->alpha_buffer};
-  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
-                        src_fb->alpha_stride};
-
-  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
-                      dst_fb->alpha_buffer};
-  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
-                        dst_fb->alpha_stride};
-
-  for (y = 0; y < out_h; y += 16) {
-    for (x = 0; x < out_w; x += 16) {
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        const int factor = i == 0 ? 1 : 2;
-        const int x_q4 = x * (16 / factor) * in_w / out_w;
-        const int y_q4 = y * (16 / factor) * in_h / out_h;
-        const int src_stride = src_strides[i];
-        const int dst_stride = dst_strides[i];
-        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
-                                 x / factor * in_w / out_w;
-        uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor;
-
-        vp9_convolve8(src, src_stride, dst, dst_stride,
-                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                      16 / factor, 16 / factor);
-      }
-    }
-  }
-
-  vp8_yv12_extend_frame_borders(dst_fb);
-}
-
-
-static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
-  // this frame refreshes means next frames don't unless specified by user
-  cpi->frames_since_golden = 0;
-
-#if CONFIG_MULTIPLE_ARF
-  if (!cpi->multi_arf_enabled)
-#endif
-    // Clear the alternate reference update pending flag.
-    cpi->source_alt_ref_pending = 0;
-
-  // Set the alternate reference frame active flag
-  cpi->source_alt_ref_active = 1;
-}
-static void update_golden_frame_stats(VP9_COMP *cpi) {
-  // Update the Golden frame usage counts.
-  if (cpi->refresh_golden_frame) {
-    // this frame refreshes means next frames don't unless specified by user
-    cpi->refresh_golden_frame = 0;
-    cpi->frames_since_golden = 0;
-
-    // ******** Fixed Q test code only ************
-    // If we are going to use the ALT reference for the next group of frames
-    // set a flag to say so.
-    if (cpi->oxcf.fixed_q >= 0 &&
-        cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
-      cpi->source_alt_ref_pending = 1;
-      cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-
-      // TODO(ivan): For SVC encoder, GF automatic update is disabled by using
-      // a large GF_interval.
-      if (cpi->use_svc) {
-        cpi->frames_till_gf_update_due = INT_MAX;
-      }
-    }
-
-    if (!cpi->source_alt_ref_pending)
-      cpi->source_alt_ref_active = 0;
-
-    // Decrement count down till next gf
-    if (cpi->frames_till_gf_update_due > 0)
-      cpi->frames_till_gf_update_due--;
-
-  } else if (!cpi->refresh_alt_ref_frame) {
-    // Decrement count down till next gf
-    if (cpi->frames_till_gf_update_due > 0)
-      cpi->frames_till_gf_update_due--;
-
-    if (cpi->frames_till_alt_ref_frame)
-      cpi->frames_till_alt_ref_frame--;
-
-    cpi->frames_since_golden++;
-  }
-}
-
-static int find_fp_qindex() {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (vp9_convert_qindex_to_q(i) >= 30.0) {
-      break;
-    }
-  }
-
-  if (i == QINDEX_RANGE)
-    i--;
-
-  return i;
-}
-
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
-                        unsigned int *frame_flags) {
-  (void) size;
-  (void) dest;
-  (void) frame_flags;
-
-  vp9_set_quantizer(cpi, find_fp_qindex());
-  vp9_first_pass(cpi);
-}
-
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
-static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
-#define EDGE_THRESH 128
-  int i, j;
-  int num_edge_pels = 0;
-  int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
-  uint8_t *prev = frame->y_buffer + 1;
-  uint8_t *curr = frame->y_buffer + 1 + frame->y_stride;
-  uint8_t *next = frame->y_buffer + 1 + 2 * frame->y_stride;
-  for (i = 1; i < frame->y_height - 1; i++) {
-    for (j = 1; j < frame->y_width - 1; j++) {
-      /* Sobel hor and ver gradients */
-      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
-              (next[1] - next[-1]);
-      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
-              (prev[-1] - next[-1]);
-      h = (h < 0 ? -h : h);
-      v = (v < 0 ? -v : v);
-      if (h > EDGE_THRESH || v > EDGE_THRESH)
-        num_edge_pels++;
-      curr++;
-      prev++;
-      next++;
-    }
-    curr += frame->y_stride - frame->y_width + 2;
-    prev += frame->y_stride - frame->y_width + 2;
-    next += frame->y_stride - frame->y_width + 2;
-  }
-  return (double)num_edge_pels / num_pels;
-}
-
-// Function to test for conditions that indicate we should loop
-// back and recode a frame.
-static int recode_loop_test(VP9_COMP *cpi,
-                            int high_limit, int low_limit,
-                            int q, int maxq, int minq) {
-  int force_recode = 0;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Is frame recode allowed at all
-  // Yes if either recode mode 1 is selected or mode two is selected
-  // and the frame is a key frame. golden frame or alt_ref_frame
-  if ((cpi->sf.recode_loop == 1) ||
-      ((cpi->sf.recode_loop == 2) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cpi->refresh_golden_frame ||
-        cpi->refresh_alt_ref_frame))) {
-    // General over and under shoot tests
-    if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
-        ((cpi->projected_frame_size < low_limit) && (q > minq))) {
-      force_recode = 1;
-    } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      // Deal with frame undershoot and whether or not we are
-      // below the automatically set cq level.
-      if (q > cpi->cq_target_quality &&
-          cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
-        force_recode = 1;
-      } else if (q > cpi->oxcf.cq_level &&
-                 cpi->projected_frame_size < cpi->min_frame_bandwidth &&
-                 cpi->active_best_quality > cpi->oxcf.cq_level) {
-        // Severe undershoot and between auto and user cq level
-        force_recode = 1;
-        cpi->active_best_quality = cpi->oxcf.cq_level;
-      }
-    }
-  }
-
-  return force_recode;
-}
-
-static void update_reference_frames(VP9_COMP * const cpi) {
-  VP9_COMMON * const cm = &cpi->common;
-
-  // At this point the new frame has been encoded.
-  // If any buffer copy / swapping is signaled it should be done here.
-  if (cm->frame_type == KEY_FRAME) {
-    ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
-    ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-  }
-#if CONFIG_MULTIPLE_ARF
-  else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
-      !cpi->refresh_alt_ref_frame) {
-#else
-  else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
-           !cpi->use_svc) {
-#endif
-    /* Preserve the previously existing golden frame and update the frame in
-     * the alt ref slot instead. This is highly specific to the current use of
-     * alt-ref as a forward reference, and this needs to be generalized as
-     * other uses are implemented (like RTC/temporal scaling)
-     *
-     * The update to the buffer in the alt ref slot was signaled in
-     * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated
-     * as the golden frame next time.
-     */
-    int tmp;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-
-    tmp = cpi->alt_fb_idx;
-    cpi->alt_fb_idx = cpi->gld_fb_idx;
-    cpi->gld_fb_idx = tmp;
-  }  else { /* For non key/golden frames */
-    if (cpi->refresh_alt_ref_frame) {
-      int arf_idx = cpi->alt_fb_idx;
-#if CONFIG_MULTIPLE_ARF
-      if (cpi->multi_arf_enabled) {
-        arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1];
-      }
-#endif
-      ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
-    }
-
-    if (cpi->refresh_golden_frame) {
-      ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
-    }
-  }
-
-  if (cpi->refresh_last_frame) {
-    ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
-  }
-}
-
-static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  struct loopfilter *lf = &cm->lf;
-  if (xd->lossless) {
-      lf->filter_level = 0;
-  } else {
-    struct vpx_usec_timer timer;
-
-    vp9_clear_system_state();
-
-    vpx_usec_timer_start(&timer);
-
-    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick);
-
-    vpx_usec_timer_mark(&timer);
-    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
-  }
-
-  if (lf->filter_level > 0) {
-    vp9_set_alt_lf_level(cpi, lf->filter_level);
-    vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0);
-  }
-
-  vp9_extend_frame_inner_borders(cm->frame_to_show,
-                                 cm->subsampling_x, cm->subsampling_y);
-}
-
-static void scale_references(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int i;
-  int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
-                                      cpi->alt_fb_idx};
-
-  for (i = 0; i < 3; i++) {
-    YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
-
-    if (ref->y_crop_width != cm->width ||
-        ref->y_crop_height != cm->height) {
-      int new_fb = get_free_fb(cm);
-
-      vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS);
-      scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
-      cpi->scaled_ref_idx[i] = new_fb;
-    } else {
-      cpi->scaled_ref_idx[i] = cm->ref_frame_map[refs[i]];
-      cm->fb_idx_ref_cnt[cm->ref_frame_map[refs[i]]]++;
-    }
-  }
-}
-
-static void release_scaled_references(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int i;
-
-  for (i = 0; i < 3; i++)
-    cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
-}
-
-static void full_to_model_count(unsigned int *model_count,
-                                unsigned int *full_count) {
-  int n;
-  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
-  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
-  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
-  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
-    model_count[TWO_TOKEN] += full_count[n];
-  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
-}
-
-static void full_to_model_counts(
-    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
-  int i, j, k, l;
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < REF_TYPES; ++j)
-      for (k = 0; k < COEF_BANDS; ++k)
-        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          if (l >= 3 && k == 0)
-            continue;
-          full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
-        }
-}
-
-#if 0 && CONFIG_INTERNAL_STATS
-static void output_frame_level_debug_stats(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
-  int recon_err;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
-
-  if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
-        "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-        "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
-        "%10.3f %8d %10d %10d %10d\n",
-        cpi->common.current_video_frame, cpi->this_frame_target,
-        cpi->projected_frame_size, 0,
-        (cpi->projected_frame_size - cpi->this_frame_target),
-        (int)cpi->total_target_vs_actual,
-        (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-        (int)cpi->total_actual_bits, cm->base_qindex,
-        vp9_convert_qindex_to_q(cm->base_qindex),
-        (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-        vp9_convert_qindex_to_q(cpi->active_best_quality),
-        vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q,
-        vp9_convert_qindex_to_q(cpi->ni_av_qi),
-        vp9_convert_qindex_to_q(cpi->cq_target_quality),
-        cpi->refresh_last_frame, cpi->refresh_golden_frame,
-        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost,
-        cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left,
-        cpi->twopass.total_left_stats.coded_error,
-        (double)cpi->twopass.bits_left /
-            (1 + cpi->twopass.total_left_stats.coded_error),
-        cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct);
-
-  fclose(f);
-
-  if (0) {
-    FILE *const fmodes = fopen("Modes.stt", "a");
-    int i;
-
-    fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
-            cm->frame_type, cpi->refresh_golden_frame,
-            cpi->refresh_alt_ref_frame);
-
-    for (i = 0; i < MAX_MODES; ++i)
-      fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-    for (i = 0; i < MAX_REFS; ++i)
-      fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
-
-    fprintf(fmodes, "\n");
-
-    fclose(fmodes);
-  }
-}
-#endif
-
-static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
-                                      int * bottom_index, int * top_index) {
-  // Set an active best quality and if necessary active worst quality
-  int q = cpi->active_worst_quality;
-  VP9_COMMON *const cm = &cpi->common;
-
-  if (frame_is_intra_only(cm)) {
-#if !CONFIG_MULTIPLE_ARF
-    // Handle the special case for key frames forced when we have75 reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping.
-    if (cpi->this_key_frame_forced) {
-      int delta_qindex;
-      int qindex = cpi->last_boosted_qindex;
-      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-
-      delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
-                                        (last_boosted_q * 0.75));
-
-      cpi->active_best_quality = MAX(qindex + delta_qindex,
-                                     cpi->best_quality);
-    } else {
-      int high = 5000;
-      int low = 400;
-      double q_adj_factor = 1.0;
-      double q_val;
-
-      // Baseline value derived from cpi->active_worst_quality and kf boost
-      cpi->active_best_quality = get_active_quality(q, cpi->kf_boost,
-                                                    low, high,
-                                                    kf_low_motion_minq,
-                                                    kf_high_motion_minq);
-
-      // Allow somewhat lower kf minq with small image formats.
-      if ((cm->width * cm->height) <= (352 * 288)) {
-        q_adj_factor -= 0.25;
-      }
-
-      // Make a further adjustment based on the kf zero motion measure.
-      q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
-
-      // Convert the adjustment factor to a qindex delta
-      // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
-      cpi->active_best_quality +=
-          vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
-    }
-#else
-    double current_q;
-    // Force the KF quantizer to be 30% of the active_worst_quality.
-    current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
-    cpi->active_best_quality = cpi->active_worst_quality
-        + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
-#endif
-  } else if (!cpi->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-    int high = 2000;
-    int low = 400;
-
-    // Use the lower of cpi->active_worst_quality and recent
-    // average Q as basis for GF/ARF best Q limit unless last frame was
-    // a key frame.
-    if (cpi->frames_since_key > 1 &&
-        cpi->avg_frame_qindex < cpi->active_worst_quality) {
-      q = cpi->avg_frame_qindex;
-    }
-    // For constrained quality dont allow Q less than the cq level
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      if (q < cpi->cq_target_quality)
-        q = cpi->cq_target_quality;
-      if (cpi->frames_since_key > 1) {
-        cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
-                                                      low, high,
-                                                      afq_low_motion_minq,
-                                                      afq_high_motion_minq);
-      } else {
-        cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
-                                                      low, high,
-                                                      gf_low_motion_minq,
-                                                      gf_high_motion_minq);
-      }
-      // Constrained quality use slightly lower active best.
-      cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
-
-    } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-      if (!cpi->refresh_alt_ref_frame) {
-        cpi->active_best_quality = cpi->cq_target_quality;
-      } else {
-        if (cpi->frames_since_key > 1) {
-          cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
-                                                        low, high,
-                                                        afq_low_motion_minq,
-                                                        afq_high_motion_minq);
-        } else {
-          cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
-                                                        low, high,
-                                                        gf_low_motion_minq,
-                                                        gf_high_motion_minq);
-        }
-      }
-    } else {
-        cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
-                                                      low, high,
-                                                      gf_low_motion_minq,
-                                                      gf_high_motion_minq);
-    }
-  } else {
-    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-      cpi->active_best_quality = cpi->cq_target_quality;
-    } else {
-#ifdef ONE_SHOT_Q_ESTIMATE
-#ifdef STRICT_ONE_SHOT_Q
-      cpi->active_best_quality = q;
-#else
-      cpi->active_best_quality = inter_minq[q];
-#endif
-#else
-      cpi->active_best_quality = inter_minq[q];
-      // 1-pass: for now, use the average Q for the active_best, if its lower
-      // than active_worst.
-      if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
-        cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
-#endif
-
-      // For the constrained quality mode we don't want
-      // q to fall below the cq level.
-      if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-          (cpi->active_best_quality < cpi->cq_target_quality)) {
-        // If we are strongly undershooting the target rate in the last
-        // frames then use the user passed in cq value not the auto
-        // cq value.
-        if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
-          cpi->active_best_quality = cpi->oxcf.cq_level;
-        else
-          cpi->active_best_quality = cpi->cq_target_quality;
-      }
-    }
-  }
-
-  // Clip the active best and worst quality values to limits
-  if (cpi->active_worst_quality > cpi->worst_quality)
-    cpi->active_worst_quality = cpi->worst_quality;
-
-  if (cpi->active_best_quality < cpi->best_quality)
-    cpi->active_best_quality = cpi->best_quality;
-
-  if (cpi->active_best_quality > cpi->worst_quality)
-    cpi->active_best_quality = cpi->worst_quality;
-
-  if (cpi->active_worst_quality < cpi->active_best_quality)
-    cpi->active_worst_quality = cpi->active_best_quality;
-
-  // Limit Q range for the adaptive loop.
-  if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
-    *top_index =
-      (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
-    // If this is the first (key) frame in 1-pass, active best is the user
-    // best-allowed, and leave the top_index to active_worst.
-    if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
-      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
-      *top_index = cpi->oxcf.worst_allowed_q;
-    }
-  } else if (!cpi->is_src_frame_alt_ref &&
-             (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-    *top_index =
-      (cpi->active_worst_quality + cpi->active_best_quality) / 2;
-  } else {
-    *top_index = cpi->active_worst_quality;
-  }
-  *bottom_index = cpi->active_best_quality;
-
-  if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-    q = cpi->active_best_quality;
-  // Special case code to try and match quality with forced key frames
-  } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-    q = cpi->last_boosted_qindex;
-  } else {
-    // Determine initial Q to try.
-    if (cpi->pass == 0) {
-      // 1-pass: for now, use per-frame-bw for target size of frame, scaled
-      // by |x| for key frame.
-      int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
-      q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth);
-    } else {
-      q = vp9_regulate_q(cpi, cpi->this_frame_target);
-    }
-    if (q > *top_index)
-      q = *top_index;
-  }
-
-  return q;
-}
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
-                                      unsigned long *size,
-                                      unsigned char *dest,
-                                      unsigned int *frame_flags) {
-  VP9_COMMON *const cm = &cpi->common;
-  TX_SIZE t;
-  int q;
-  int frame_over_shoot_limit;
-  int frame_under_shoot_limit;
-
-  int loop = 0;
-  int loop_count;
-
-  int q_low;
-  int q_high;
-
-  int top_index;
-  int bottom_index;
-  int active_worst_qchanged = 0;
-
-  int overshoot_seen = 0;
-  int undershoot_seen = 0;
-
-  SPEED_FEATURES *const sf = &cpi->sf;
-  unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
-  struct segmentation *const seg = &cm->seg;
-
-  /* Scale the source buffer, if required. */
-  if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
-      cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
-    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
-    cpi->Source = &cpi->scaled_source;
-  } else {
-    cpi->Source = cpi->un_scaled_source;
-  }
-  scale_references(cpi);
-
-  // Clear down mmx registers to allow floating point in what follows.
-  vp9_clear_system_state();
-
-  // For an alt ref frame in 2 pass we skip the call to the second
-  // pass function that sets the target bandwidth so we must set it here.
-  if (cpi->refresh_alt_ref_frame) {
-    // Set a per frame bit target for the alt ref frame.
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    // Set a per second target bitrate.
-    cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
-  }
-
-  // Clear zbin over-quant value and mode boost values.
-  cpi->zbin_mode_boost = 0;
-
-  // Enable or disable mode based tweaking of the zbin.
-  // For 2 pass only used where GF/ARF prediction quality
-  // is above a threshold.
-  cpi->zbin_mode_boost = 0;
-  cpi->zbin_mode_boost_enabled = 0;
-
-  // Current default encoder behavior for the altref sign bias.
-  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
-
-  // Check to see if a key frame is signaled.
-  // For two pass with auto key frame enabled cm->frame_type may already be
-  // set, but not for one pass.
-  if ((cm->current_video_frame == 0) ||
-      (cm->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (cpi->frames_since_key %
-                              cpi->key_frame_frequency == 0))) {
-    // Set frame type to key frame for the force key frame, if we exceed the
-    // maximum distance in an automatic keyframe selection or for the first
-    // frame.
-    cm->frame_type = KEY_FRAME;
-  }
-
-  // Set default state for segment based loop filter update flags.
-  cm->lf.mode_ref_delta_update = 0;
-
-  // Initialize cpi->mv_step_param to default based on max resolution.
-  cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
-  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
-  if (sf->auto_mv_step_size) {
-    if (frame_is_intra_only(&cpi->common)) {
-      // Initialize max_mv_magnitude for use in the first INTER frame
-      // after a key/intra-only frame.
-      cpi->max_mv_magnitude = max_mv_def;
-    } else {
-      if (cm->show_frame)
-        // Allow mv_steps to correspond to twice the max mv magnitude found
-        // in the previous frame, capped by the default max_mv_magnitude based
-        // on resolution.
-        cpi->mv_step_param = vp9_init_search_range(
-            cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
-      cpi->max_mv_magnitude = 0;
-    }
-  }
-
-  // Set various flags etc to special state if it is a key frame.
-  if (frame_is_intra_only(cm)) {
-    vp9_setup_key_frame(cpi);
-    // Reset the loop filter deltas and segmentation map.
-    setup_features(cm);
-
-    // If segmentation is enabled force a map update for key frames.
-    if (seg->enabled) {
-      seg->update_map = 1;
-      seg->update_data = 1;
-    }
-
-    // The alternate reference frame cannot be active for a key frame.
-    cpi->source_alt_ref_active = 0;
-
-    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
-    cm->frame_parallel_decoding_mode =
-      (cpi->oxcf.frame_parallel_decoding_mode != 0);
-    if (cm->error_resilient_mode) {
-      cm->frame_parallel_decoding_mode = 1;
-      cm->reset_frame_context = 0;
-      cm->refresh_frame_context = 0;
-    } else if (cm->intra_only) {
-      // Only reset the current context.
-      cm->reset_frame_context = 2;
-    }
-  }
-
-  // Configure experimental use of segmentation for enhanced coding of
-  // static regions if indicated.
-  // Only allowed in second pass of two pass (as requires lagged coding)
-  // and if the relevant speed feature flag is set.
-  if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
-    configure_static_seg_features(cpi);
-  }
-
-  // Decide how big to make the frame.
-  vp9_pick_frame_size(cpi);
-
-  vp9_clear_system_state();
-
-  q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
-
-  q_high = top_index;
-  q_low  = bottom_index;
-
-  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
-                                &frame_over_shoot_limit);
-
-#if CONFIG_MULTIPLE_ARF
-  // Force the quantizer determined by the coding order pattern.
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) {
-    double new_q;
-    double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
-    int level = cpi->this_frame_weight;
-    assert(level >= 0);
-
-    // Set quantizer steps at 10% increments.
-    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
-    q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q);
-
-    bottom_index = q;
-    top_index    = q;
-    q_low  = q;
-    q_high = q;
-
-    printf("frame:%d q:%d\n", cm->current_video_frame, q);
-  }
-#endif
-
-  loop_count = 0;
-  vp9_zero(cpi->rd_tx_select_threshes);
-
-  if (!frame_is_intra_only(cm)) {
-    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-    /* TODO: Decide this more intelligently */
-    cm->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
-    set_mvcost(cpi);
-  }
-
-#if CONFIG_VP9_POSTPROC
-
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    int l = 0;
-
-    switch (cpi->oxcf.noise_sensitivity) {
-      case 1:
-        l = 20;
-        break;
-      case 2:
-        l = 40;
-        break;
-      case 3:
-        l = 60;
-        break;
-      case 4:
-      case 5:
-        l = 100;
-        break;
-      case 6:
-        l = 150;
-        break;
-    }
-
-    vp9_denoise(cpi->Source, cpi->Source, l);
-  }
-
-#endif
-
-#ifdef OUTPUT_YUV_SRC
-  vp9_write_yuv_frame(cpi->Source);
-#endif
-
-  do {
-    vp9_clear_system_state();  // __asm emms;
-
-    vp9_set_quantizer(cpi, q);
-
-    if (loop_count == 0) {
-      // Set up entropy context depending on frame type. The decoder mandates
-      // the use of the default context, index 0, for keyframes and inter
-      // frames where the error_resilient_mode or intra_only flag is set. For
-      // other inter-frames the encoder currently uses only two contexts;
-      // context 1 for ALTREF frames and context 0 for the others.
-      if (cm->frame_type == KEY_FRAME) {
-        vp9_setup_key_frame(cpi);
-      } else {
-        if (!cm->intra_only && !cm->error_resilient_mode) {
-          cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
-        }
-        vp9_setup_inter_frame(cpi);
-      }
-    }
-
-    if (cpi->sf.variance_adaptive_quantization) {
-        vp9_vaq_frame_setup(cpi);
-    }
-
-    // transform / motion compensation build reconstruction frame
-
-    vp9_encode_frame(cpi);
-
-    // Update the skip mb flag probabilities based on the distribution
-    // seen in the last encoder iteration.
-    // update_base_skip_probs(cpi);
-
-    vp9_clear_system_state();  // __asm emms;
-
-    // Dummy pack of the bitstream using up to date stats to get an
-    // accurate estimate of output frame size to determine if we need
-    // to recode.
-    vp9_save_coding_context(cpi);
-    cpi->dummy_packing = 1;
-    vp9_pack_bitstream(cpi, dest, size);
-    cpi->projected_frame_size = (*size) << 3;
-    vp9_restore_coding_context(cpi);
-
-    if (frame_over_shoot_limit == 0)
-      frame_over_shoot_limit = 1;
-    active_worst_qchanged = 0;
-
-    if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
-      loop = 0;
-    } else {
-      // Special case handling for forced key frames
-      if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-        int last_q = q;
-        int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
-
-        int high_err_target = cpi->ambient_err;
-        int low_err_target = cpi->ambient_err >> 1;
-
-        // Prevent possible divide by zero error below for perfect KF
-        kf_err += !kf_err;
-
-        // The key frame is not good enough or we can afford
-        // to make it better without undue risk of popping.
-        if ((kf_err > high_err_target &&
-             cpi->projected_frame_size <= frame_over_shoot_limit) ||
-            (kf_err > low_err_target &&
-             cpi->projected_frame_size <= frame_under_shoot_limit)) {
-          // Lower q_high
-          q_high = q > q_low ? q - 1 : q_low;
-
-          // Adjust Q
-          q = (q * high_err_target) / kf_err;
-          q = MIN(q, (q_high + q_low) >> 1);
-        } else if (kf_err < low_err_target &&
-                   cpi->projected_frame_size >= frame_under_shoot_limit) {
-          // The key frame is much better than the previous frame
-          // Raise q_low
-          q_low = q < q_high ? q + 1 : q_high;
-
-          // Adjust Q
-          q = (q * low_err_target) / kf_err;
-          q = MIN(q, (q_high + q_low + 1) >> 1);
-        }
-
-        // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
-
-        loop = q != last_q;
-      } else if (recode_loop_test(
-          cpi, frame_over_shoot_limit, frame_under_shoot_limit,
-          q, top_index, bottom_index)) {
-        // Is the projected frame size out of range and are we allowed
-        // to attempt to recode.
-        int last_q = q;
-        int retries = 0;
-
-        // Frame size out of permitted range:
-        // Update correction factor & compute new Q to try...
-
-        // Frame is too large
-        if (cpi->projected_frame_size > cpi->this_frame_target) {
-          // Raise Qlow as to at least the current value
-          q_low = q < q_high ? q + 1 : q_high;
-
-          if (undershoot_seen || loop_count > 1) {
-            // Update rate_correction_factor unless
-            // cpi->active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 1);
-
-            q = (q_high + q_low + 1) / 2;
-          } else {
-            // Update rate_correction_factor unless
-            // cpi->active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 0);
-
-            q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-            while (q < q_low && retries < 10) {
-              vp9_update_rate_correction_factors(cpi, 0);
-              q = vp9_regulate_q(cpi, cpi->this_frame_target);
-              retries++;
-            }
-          }
-
-          overshoot_seen = 1;
-        } else {
-          // Frame is too small
-          q_high = q > q_low ? q - 1 : q_low;
-
-          if (overshoot_seen || loop_count > 1) {
-            // Update rate_correction_factor unless
-            // cpi->active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 1);
-
-            q = (q_high + q_low) / 2;
-          } else {
-            // Update rate_correction_factor unless
-            // cpi->active_worst_quality has changed.
-            if (!active_worst_qchanged)
-              vp9_update_rate_correction_factors(cpi, 0);
-
-            q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-            // Special case reset for qlow for constrained quality.
-            // This should only trigger where there is very substantial
-            // undershoot on a frame and the auto cq level is above
-            // the user passsed in value.
-            if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
-              q_low = q;
-            }
-
-            while (q > q_high && retries < 10) {
-              vp9_update_rate_correction_factors(cpi, 0);
-              q = vp9_regulate_q(cpi, cpi->this_frame_target);
-              retries++;
-            }
-          }
-
-          undershoot_seen = 1;
-        }
-
-        // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
-
-        loop = q != last_q;
-      } else {
-        loop = 0;
-      }
-    }
-
-    if (cpi->is_src_frame_alt_ref)
-      loop = 0;
-
-    if (loop) {
-      loop_count++;
-
-#if CONFIG_INTERNAL_STATS
-      cpi->tot_recode_hits++;
-#endif
-    }
-  } while (loop);
-
-  // Special case code to reduce pulsing when key frames are forced at a
-  // fixed interval. Note the reconstruction error if it is the frame before
-  // the force key frame
-  if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
-    cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
-  }
-
-  if (cm->frame_type == KEY_FRAME)
-    cpi->refresh_last_frame = 1;
-
-  cm->frame_to_show = get_frame_new_buffer(cm);
-
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
-  // Pick the loop filter level for the frame.
-  loopfilter_frame(cpi, cm);
-
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 2000);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 3000);
-#endif
-
-  // build the bitstream
-  cpi->dummy_packing = 0;
-  vp9_pack_bitstream(cpi, dest, size);
-
-  if (cm->seg.update_map)
-    update_reference_segmentation_map(cpi);
-
-  release_scaled_references(cpi);
-  update_reference_frames(cpi);
-
-  for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cpi->common.counts.coef[t],
-                         cpi->coef_counts[t]);
-  if (!cpi->common.error_resilient_mode &&
-      !cpi->common.frame_parallel_decoding_mode) {
-    vp9_adapt_coef_probs(&cpi->common);
-  }
-
-  if (!frame_is_intra_only(&cpi->common)) {
-    FRAME_COUNTS *counts = &cpi->common.counts;
-
-    vp9_copy(counts->y_mode, cpi->y_mode_count);
-    vp9_copy(counts->uv_mode, cpi->y_uv_mode_count);
-    vp9_copy(counts->partition, cpi->partition_count);
-    vp9_copy(counts->intra_inter, cpi->intra_inter_count);
-    vp9_copy(counts->comp_inter, cpi->comp_inter_count);
-    vp9_copy(counts->single_ref, cpi->single_ref_count);
-    vp9_copy(counts->comp_ref, cpi->comp_ref_count);
-    counts->mv = cpi->NMVcount;
-    if (!cpi->common.error_resilient_mode &&
-        !cpi->common.frame_parallel_decoding_mode) {
-      vp9_adapt_mode_probs(&cpi->common);
-      vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv);
-    }
-  }
-
-#ifdef ENTROPY_STATS
-  vp9_update_mode_context_stats(cpi);
-#endif
-
-  /* Move storing frame_type out of the above loop since it is also
-   * needed in motion search besides loopfilter */
-  cm->last_frame_type = cm->frame_type;
-
-  // Update rate control heuristics
-  cpi->total_byte_count += (*size);
-  cpi->projected_frame_size = (*size) << 3;
-
-  // Post encode loop adjustment of Q prediction.
-  if (!active_worst_qchanged)
-    vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0);
-
-  cpi->last_q[cm->frame_type] = cm->base_qindex;
-
-  // Keep record of last boosted (KF/KF/ARF) Q value.
-  // If the current frame is coded at a lower Q then we also update it.
-  // If all mbs in this group are skipped only update if the Q value is
-  // better than that already stored.
-  // This is used to help set quality in forced key frames to reduce popping
-  if ((cm->base_qindex < cpi->last_boosted_qindex) ||
-      ((cpi->static_mb_pct < 100) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cpi->refresh_alt_ref_frame ||
-        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
-    cpi->last_boosted_qindex = cm->base_qindex;
-  }
-
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_adjust_key_frame_context(cpi);
-  }
-
-  // Keep a record of ambient average Q.
-  if (cm->frame_type != KEY_FRAME)
-    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex +
-                            cm->base_qindex) >> 2;
-
-  // Keep a record from which we can calculate the average Q excluding GF
-  // updates and key frames.
-  if (cm->frame_type != KEY_FRAME &&
-      !cpi->refresh_golden_frame &&
-      !cpi->refresh_alt_ref_frame) {
-    cpi->ni_frames++;
-    cpi->tot_q += vp9_convert_qindex_to_q(q);
-    cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
-
-    // Calculate the average Q for normal inter frames (not key or GFU frames).
-    cpi->ni_tot_qi += q;
-    cpi->ni_av_qi = cpi->ni_tot_qi / cpi->ni_frames;
-  }
-
-  // Update the buffer level variable.
-  // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame)
-    cpi->bits_off_target -= cpi->projected_frame_size;
-  else
-    cpi->bits_off_target += cpi->av_per_frame_bandwidth -
-                            cpi->projected_frame_size;
-
-  // Clip the buffer level at the maximum buffer size
-  if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
-    cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-
-  // Rolling monitors of whether we are over or underspending used to help
-  // regulate min and Max Q in two pass.
-  if (cm->frame_type != KEY_FRAME) {
-    cpi->rolling_target_bits =
-      ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
-    cpi->rolling_actual_bits =
-      ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
-    cpi->long_rolling_target_bits =
-      ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
-    cpi->long_rolling_actual_bits =
-      ((cpi->long_rolling_actual_bits * 31) +
-       cpi->projected_frame_size + 16) / 32;
-  }
-
-  // Actual bits spent
-  cpi->total_actual_bits += cpi->projected_frame_size;
-
-  // Debug stats
-  cpi->total_target_vs_actual += (cpi->this_frame_target -
-                                  cpi->projected_frame_size);
-
-  cpi->buffer_level = cpi->bits_off_target;
-
-#ifndef DISABLE_RC_LONG_TERM_MEM
-  // Update bits left to the kf and gf groups to account for overshoot or
-  // undershoot on these frames
-  if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->this_frame_target -
-                                  cpi->projected_frame_size;
-
-    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
-  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->this_frame_target -
-                                  cpi->projected_frame_size;
-
-    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
-  }
-#endif
-
-#if 0
-  output_frame_level_debug_stats(cpi);
-#endif
-  if (cpi->refresh_golden_frame == 1)
-    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
-  else
-    cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
-
-  if (cpi->refresh_alt_ref_frame == 1)
-    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
-  else
-    cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
-
-
-  if (cpi->refresh_last_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_last = 0;
-
-  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 1;
-  else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)
-    cpi->alt_is_last = 0;
-
-  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 1;
-  else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)
-    cpi->gold_is_alt = 0;
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (cpi->gold_is_last)
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-
-  if (cpi->alt_is_last)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->gold_is_alt)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame
-      && (cm->frame_type != KEY_FRAME))
-    // Update the alternate reference frame stats as appropriate.
-    update_alt_ref_frame_stats(cpi);
-  else
-    // Update the Golden frame stats as appropriate.
-    update_golden_frame_stats(cpi);
-
-  if (cm->frame_type == KEY_FRAME) {
-    // Tell the caller that the frame was coded as a key frame
-    *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
-
-#if CONFIG_MULTIPLE_ARF
-    // Reset the sequence number.
-    if (cpi->multi_arf_enabled) {
-      cpi->sequence_number = 0;
-      cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
-      cpi->new_frame_coding_order_period = -1;
-    }
-#endif
-
-    // As this frame is a key frame the next defaults to an inter frame.
-    cm->frame_type = INTER_FRAME;
-  } else {
-    *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
-
-#if CONFIG_MULTIPLE_ARF
-    /* Increment position in the coded frame sequence. */
-    if (cpi->multi_arf_enabled) {
-      ++cpi->sequence_number;
-      if (cpi->sequence_number >= cpi->frame_coding_order_period) {
-        cpi->sequence_number = 0;
-        cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
-        cpi->new_frame_coding_order_period = -1;
-      }
-      cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
-      assert(cpi->this_frame_weight >= 0);
-    }
-#endif
-  }
-
-  // Clear the one shot update flags for segmentation map and mode/ref loop
-  // filter deltas.
-  cm->seg.update_map = 0;
-  cm->seg.update_data = 0;
-  cm->lf.mode_ref_delta_update = 0;
-
-  // keep track of the last coded dimensions
-  cm->last_width = cm->width;
-  cm->last_height = cm->height;
-
-  // reset to normal state now that we are done.
-  cm->last_show_frame = cm->show_frame;
-  if (cm->show_frame) {
-    // current mip will be the prev_mip for the next frame
-    MODE_INFO *temp = cm->prev_mip;
-    MODE_INFO **temp2 = cm->prev_mi_grid_base;
-    cm->prev_mip = cm->mip;
-    cm->mip = temp;
-    cm->prev_mi_grid_base = cm->mi_grid_base;
-    cm->mi_grid_base = temp2;
-
-    // update the upper left visible macroblock ptrs
-    cm->mi = cm->mip + cm->mode_info_stride + 1;
-    cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1;
-
-    cpi->mb.e_mbd.mi_8x8 = cm->mi_grid_visible;
-    cpi->mb.e_mbd.mi_8x8[0] = cm->mi;
-
-    // Don't increment frame counters if this was an altref buffer
-    // update not a real frame
-    ++cm->current_video_frame;
-    ++cpi->frames_since_key;
-  }
-  // restore prev_mi
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
-  cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
-}
-
-static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
-                        unsigned char *dest, unsigned int *frame_flags) {
-  cpi->enable_encode_breakout = 1;
-
-  if (!cpi->refresh_alt_ref_frame)
-    vp9_second_pass(cpi);
-
-  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
-#ifdef DISABLE_RC_LONG_TERM_MEM
-  cpi->twopass.bits_left -=  cpi->this_frame_target;
-#else
-  cpi->twopass.bits_left -= 8 * *size;
-#endif
-
-  if (!cpi->refresh_alt_ref_frame) {
-    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
-    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                        * cpi->oxcf.two_pass_vbrmin_section
-                                        / 100);
-
-    if (two_pass_min_rate < lower_bounds_min_rate)
-      two_pass_min_rate = lower_bounds_min_rate;
-
-    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
-                              / cpi->oxcf.framerate);
-  }
-}
-
-static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {
-  VP9_COMMON            *cm = &cpi->common;
-  if (!cpi->initial_width) {
-    // TODO(jkoleszar): Support 1/4 subsampling?
-    cm->subsampling_x = (sd != NULL) && sd->uv_width < sd->y_width;
-    cm->subsampling_y = (sd != NULL) && sd->uv_height < sd->y_height;
-    alloc_raw_frame_buffers(cpi);
-
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
-  }
-}
-
-
-int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
-                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time) {
-  VP9_COMP              *cpi = (VP9_COMP *) ptr;
-  struct vpx_usec_timer  timer;
-  int                    res = 0;
-
-  check_initial_width(cpi, sd);
-  vpx_usec_timer_start(&timer);
-  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
-                         cpi->active_map_enabled ? cpi->active_map : NULL))
-    res = -1;
-  vpx_usec_timer_mark(&timer);
-  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
-
-  return res;
-}
-
-
-static int frame_is_reference(const VP9_COMP *cpi) {
-  const VP9_COMMON *cm = &cpi->common;
-
-  return cm->frame_type == KEY_FRAME ||
-         cpi->refresh_last_frame ||
-         cpi->refresh_golden_frame ||
-         cpi->refresh_alt_ref_frame ||
-         cm->refresh_frame_context ||
-         cm->lf.mode_ref_delta_update ||
-         cm->seg.update_map ||
-         cm->seg.update_data;
-}
-
-#if CONFIG_MULTIPLE_ARF
-int is_next_frame_arf(VP9_COMP *cpi) {
-  // Negative entry in frame_coding_order indicates an ARF at this position.
-  return cpi->frame_coding_order[cpi->sequence_number + 1] < 0 ? 1 : 0;
-}
-#endif
-
-int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
-                            unsigned long *size, unsigned char *dest,
-                            int64_t *time_stamp, int64_t *time_end, int flush) {
-  VP9_COMP *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON *cm = &cpi->common;
-  struct vpx_usec_timer  cmptimer;
-  YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
-  int i;
-  // FILE *fp_out = fopen("enc_frame_type.txt", "a");
-
-  if (!cpi)
-    return -1;
-
-  vpx_usec_timer_start(&cmptimer);
-
-  cpi->source = NULL;
-
-  cpi->common.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
-  set_mvcost(cpi);
-
-  // Should we code an alternate reference frame.
-  if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) {
-    int frames_to_arf;
-
-#if CONFIG_MULTIPLE_ARF
-    assert(!cpi->multi_arf_enabled ||
-           cpi->frame_coding_order[cpi->sequence_number] < 0);
-
-    if (cpi->multi_arf_enabled && (cpi->pass == 2))
-      frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
-        - cpi->next_frame_in_order;
-    else
-#endif
-      frames_to_arf = cpi->frames_till_gf_update_due;
-
-    assert(frames_to_arf < cpi->twopass.frames_to_key);
-
-    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
-#if CONFIG_MULTIPLE_ARF
-      cpi->alt_ref_source[cpi->arf_buffered] = cpi->source;
-#else
-      cpi->alt_ref_source = cpi->source;
-#endif
-
-      if (cpi->oxcf.arnr_max_frames > 0) {
-        // Produce the filtered ARF frame.
-        // TODO(agrange) merge these two functions.
-        configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
-                              cpi->gfu_boost);
-        vp9_temporal_filter_prepare(cpi, frames_to_arf);
-        vp9_extend_frame_borders(&cpi->alt_ref_buffer,
-                                 cm->subsampling_x, cm->subsampling_y);
-        force_src_buffer = &cpi->alt_ref_buffer;
-      }
-
-      cm->show_frame = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_last_frame = 0;
-      cpi->is_src_frame_alt_ref = 0;
-
-      // TODO(agrange) This needs to vary depending on where the next ARF is.
-      cpi->frames_till_alt_ref_frame = frames_to_arf;
-
-#if CONFIG_MULTIPLE_ARF
-      if (!cpi->multi_arf_enabled)
-#endif
-        cpi->source_alt_ref_pending = 0;   // Clear Pending altf Ref flag.
-    }
-  }
-
-  if (!cpi->source) {
-#if CONFIG_MULTIPLE_ARF
-    int i;
-#endif
-    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
-      cm->show_frame = 1;
-      cm->intra_only = 0;
-
-#if CONFIG_MULTIPLE_ARF
-      // Is this frame the ARF overlay.
-      cpi->is_src_frame_alt_ref = 0;
-      for (i = 0; i < cpi->arf_buffered; ++i) {
-        if (cpi->source == cpi->alt_ref_source[i]) {
-          cpi->is_src_frame_alt_ref = 1;
-          cpi->refresh_golden_frame = 1;
-          break;
-        }
-      }
-#else
-      cpi->is_src_frame_alt_ref = cpi->alt_ref_source
-                                  && (cpi->source == cpi->alt_ref_source);
-#endif
-      if (cpi->is_src_frame_alt_ref) {
-        // Current frame is an ARF overlay frame.
-#if CONFIG_MULTIPLE_ARF
-        cpi->alt_ref_source[i] = NULL;
-#else
-        cpi->alt_ref_source = NULL;
-#endif
-        // Don't refresh the last buffer for an ARF overlay frame. It will
-        // become the GF so preserve last as an alternative prediction option.
-        cpi->refresh_last_frame = 0;
-      }
-#if CONFIG_MULTIPLE_ARF
-      ++cpi->next_frame_in_order;
-#endif
-    }
-  }
-
-  if (cpi->source) {
-    cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
-                                                           : &cpi->source->img;
-    *time_stamp = cpi->source->ts_start;
-    *time_end = cpi->source->ts_end;
-    *frame_flags = cpi->source->flags;
-
-    // fprintf(fp_out, "   Frame:%d", cm->current_video_frame);
-#if CONFIG_MULTIPLE_ARF
-    if (cpi->multi_arf_enabled) {
-      // fprintf(fp_out, "   seq_no:%d  this_frame_weight:%d",
-      //         cpi->sequence_number, cpi->this_frame_weight);
-    } else {
-      // fprintf(fp_out, "\n");
-    }
-#else
-    // fprintf(fp_out, "\n");
-#endif
-
-#if CONFIG_MULTIPLE_ARF
-    if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2))
-      cpi->source_alt_ref_pending = is_next_frame_arf(cpi);
-#endif
-  } else {
-    *size = 0;
-    if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
-      vp9_end_first_pass(cpi);    /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
-    }
-
-    // fclose(fp_out);
-    return -1;
-  }
-
-  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
-  }
-
-  // adjust frame rates based on timestamps given
-  if (!cpi->refresh_alt_ref_frame) {
-    int64_t this_duration;
-    int step = 0;
-
-    if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
-      this_duration = cpi->source->ts_end - cpi->source->ts_start;
-      step = 1;
-    } else {
-      int64_t last_duration = cpi->last_end_time_stamp_seen
-                                - cpi->last_time_stamp_seen;
-
-      this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-
-      // do a step update if the duration changes by 10%
-      if (last_duration)
-        step = (int)((this_duration - last_duration) * 10 / last_duration);
-    }
-
-    if (this_duration) {
-      if (step) {
-        vp9_new_framerate(cpi, 10000000.0 / this_duration);
-      } else {
-        // Average this frame's rate into the last second's average
-        // frame rate. If we haven't seen 1 second yet, then average
-        // over the whole interval seen.
-        const double interval = MIN((double)(cpi->source->ts_end
-                                     - cpi->first_time_stamp_ever), 10000000.0);
-        double avg_duration = 10000000.0 / cpi->oxcf.framerate;
-        avg_duration *= (interval - avg_duration + this_duration);
-        avg_duration /= interval;
-
-        vp9_new_framerate(cpi, 10000000.0 / avg_duration);
-      }
-    }
-
-    cpi->last_time_stamp_seen = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_end;
-  }
-
-  // start with a 0 size frame
-  *size = 0;
-
-  // Clear down mmx registers
-  vp9_clear_system_state();  // __asm emms;
-
-  /* find a free buffer for the new frame, releasing the reference previously
-   * held.
-   */
-  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-  cm->new_fb_idx = get_free_fb(cm);
-
-#if CONFIG_MULTIPLE_ARF
-  /* Set up the correct ARF frame. */
-  if (cpi->refresh_alt_ref_frame) {
-    ++cpi->arf_buffered;
-  }
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      (cpi->pass == 2)) {
-    cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number];
-  }
-#endif
-
-  /* Get the mapping of L/G/A to the reference buffer pool */
-  cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];
-  cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];
-  cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];
-
-#if 0  // CONFIG_MULTIPLE_ARF
-  if (cpi->multi_arf_enabled) {
-    fprintf(fp_out, "      idx(%d, %d, %d, %d) active(%d, %d, %d)",
-        cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx, cm->new_fb_idx,
-        cm->active_ref_idx[0], cm->active_ref_idx[1], cm->active_ref_idx[2]);
-    if (cpi->refresh_alt_ref_frame)
-      fprintf(fp_out, "  type:ARF");
-    if (cpi->is_src_frame_alt_ref)
-      fprintf(fp_out, "  type:OVERLAY[%d]", cpi->alt_fb_idx);
-    fprintf(fp_out, "\n");
-  }
-#endif
-
-  cm->frame_type = INTER_FRAME;
-  cm->frame_flags = *frame_flags;
-
-  // Reset the frame pointers to the current frame size
-  vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
-                           cm->width, cm->height,
-                           cm->subsampling_x, cm->subsampling_y,
-                           VP9BORDERINPIXELS);
-
-  // Calculate scaling factors for each of the 3 available references
-  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
-    vp9_setup_scale_factors(cm, i);
-
-  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
-
-  if (cpi->sf.variance_adaptive_quantization) {
-      vp9_vaq_init();
-  }
-
-  if (cpi->pass == 1) {
-    Pass1Encode(cpi, size, dest, frame_flags);
-  } else if (cpi->pass == 2) {
-    Pass2Encode(cpi, size, dest, frame_flags);
-  } else {
-    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  }
-
-  if (cm->refresh_frame_context)
-    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
-
-  if (*size > 0) {
-    // if its a dropped frame honor the requests on subsequent frames
-    cpi->droppable = !frame_is_reference(cpi);
-
-    // return to normal state
-    cm->reset_frame_context = 0;
-    cm->refresh_frame_context = 1;
-    cpi->refresh_alt_ref_frame = 0;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_last_frame = 1;
-    cm->frame_type = INTER_FRAME;
-  }
-
-  vpx_usec_timer_mark(&cmptimer);
-  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
-
-  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
-    generate_psnr_packet(cpi);
-
-#if CONFIG_INTERNAL_STATS
-
-  if (cpi->pass != 1) {
-    cpi->bytes += *size;
-
-    if (cm->show_frame) {
-      cpi->count++;
-
-      if (cpi->b_calculate_psnr) {
-        double ye, ue, ve;
-        double frame_psnr;
-        YV12_BUFFER_CONFIG      *orig = cpi->Source;
-        YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-        YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
-        int y_samples = orig->y_height * orig->y_width;
-        int uv_samples = orig->uv_height * orig->uv_width;
-        int t_samples = y_samples + 2 * uv_samples;
-        double sq_error;
-
-        ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
-                              recon->y_buffer, recon->y_stride,
-                              orig->y_crop_width, orig->y_crop_height);
-
-        ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
-                              recon->u_buffer, recon->uv_stride,
-                              orig->uv_crop_width, orig->uv_crop_height);
-
-        ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
-                              recon->v_buffer, recon->uv_stride,
-                              orig->uv_crop_width, orig->uv_crop_height);
-
-        sq_error = ye + ue + ve;
-
-        frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-        cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
-        cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-        cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-        cpi->total_sq_error += sq_error;
-        cpi->total  += frame_psnr;
-        {
-          double frame_psnr2, frame_ssim2 = 0;
-          double weight = 0;
-#if CONFIG_VP9_POSTPROC
-          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
-                      cm->lf.filter_level * 10 / 6);
-#endif
-          vp9_clear_system_state();
-
-          ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride,
-                                pp->y_buffer, pp->y_stride,
-                                orig->y_crop_width, orig->y_crop_height);
-
-          ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride,
-                                pp->u_buffer, pp->uv_stride,
-                                orig->uv_crop_width, orig->uv_crop_height);
-
-          ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride,
-                                pp->v_buffer, pp->uv_stride,
-                                orig->uv_crop_width, orig->uv_crop_height);
-
-          sq_error = ye + ue + ve;
-
-          frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-          cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
-          cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-          cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-          cpi->total_sq_error2 += sq_error;
-          cpi->totalp  += frame_psnr2;
-
-          frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      recon, 1, &weight);
-
-          cpi->summed_quality += frame_ssim2 * weight;
-          cpi->summed_weights += weight;
-
-          frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      &cm->post_proc_buffer, 1, &weight);
-
-          cpi->summedp_quality += frame_ssim2 * weight;
-          cpi->summedp_weights += weight;
-#if 0
-          {
-            FILE *f = fopen("q_used.stt", "a");
-            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
-                    cpi->common.current_video_frame, y2, u2, v2,
-                    frame_psnr2, frame_ssim2);
-            fclose(f);
-          }
-#endif
-        }
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        double y, u, v, frame_all;
-        frame_all =  vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
-                                    &y, &u, &v);
-        cpi->total_ssimg_y += y;
-        cpi->total_ssimg_u += u;
-        cpi->total_ssimg_v += v;
-        cpi->total_ssimg_all += frame_all;
-      }
-    }
-  }
-
-#endif
-  // fclose(fp_out);
-  return 0;
-}
-
-int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
-                              vp9_ppflags_t *flags) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (!cpi->common.show_frame) {
-    return -1;
-  } else {
-    int ret;
-#if CONFIG_VP9_POSTPROC
-    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
-#else
-
-    if (cpi->common.frame_to_show) {
-      *dest = *cpi->common.frame_to_show;
-      dest->y_width = cpi->common.width;
-      dest->y_height = cpi->common.height;
-      dest->uv_height = cpi->common.height / 2;
-      ret = 0;
-    } else {
-      ret = -1;
-    }
-
-#endif  // !CONFIG_VP9_POSTPROC
-    vp9_clear_system_state();
-    return ret;
-  }
-}
-
-int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[MAX_SEGMENTS],
-                   int delta_lf[MAX_SEGMENTS],
-                   unsigned int threshold[MAX_SEGMENTS]) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS];
-  struct segmentation *seg = &cpi->common.seg;
-  int i;
-
-  if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
-    return -1;
-
-  if (!map) {
-    vp9_disable_segmentation((VP9_PTR)cpi);
-    return 0;
-  }
-
-  // Set the segmentation Map
-  vp9_set_segmentation_map((VP9_PTR)cpi, map);
-
-  // Activate segmentation.
-  vp9_enable_segmentation((VP9_PTR)cpi);
-
-  // Set up the quant, LF and breakout threshold segment data
-  for (i = 0; i < MAX_SEGMENTS; i++) {
-    feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
-    feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
-    cpi->segment_encode_breakout[i] = threshold[i];
-  }
-
-  // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < MAX_SEGMENTS; i++) {
-    if (delta_q[i])
-      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
-    else
-      vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
-
-    if (delta_lf[i])
-      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
-    else
-      vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
-  }
-
-  // Initialize the feature data structure
-  // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
-  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
-
-  return 0;
-}
-
-int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
-                       unsigned int rows, unsigned int cols) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
-    if (map) {
-      vpx_memcpy(cpi->active_map, map, rows * cols);
-      cpi->active_map_enabled = 1;
-    } else {
-      cpi->active_map_enabled = 0;
-    }
-
-    return 0;
-  } else {
-    // cpi->active_map_enabled = 0;
-    return -1;
-  }
-}
-
-int vp9_set_internal_size(VP9_PTR comp,
-                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  VP9_COMMON *cm = &cpi->common;
-  int hr = 0, hs = 0, vr = 0, vs = 0;
-
-  if (horiz_mode > ONETWO || vert_mode > ONETWO)
-    return -1;
-
-  Scale2Ratio(horiz_mode, &hr, &hs);
-  Scale2Ratio(vert_mode, &vr, &vs);
-
-  // always go to the next whole number
-  cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs;
-  cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs;
-
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
-  update_frame_size(cpi);
-  return 0;
-}
-
-int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
-                         unsigned int height) {
-  VP9_COMP *cpi = (VP9_COMP *)comp;
-  VP9_COMMON *cm = &cpi->common;
-
-  check_initial_width(cpi, NULL);
-
-  if (width) {
-    cm->width = width;
-    if (cm->width * 5 < cpi->initial_width) {
-      cm->width = cpi->initial_width / 5 + 1;
-      printf("Warning: Desired width too small, changed to %d \n", cm->width);
-    }
-    if (cm->width > cpi->initial_width) {
-      cm->width = cpi->initial_width;
-      printf("Warning: Desired width too large, changed to %d \n", cm->width);
-    }
-  }
-
-  if (height) {
-    cm->height = height;
-    if (cm->height * 5 < cpi->initial_height) {
-      cm->height = cpi->initial_height / 5 + 1;
-      printf("Warning: Desired height too small, changed to %d \n", cm->height);
-    }
-    if (cm->height > cpi->initial_height) {
-      cm->height = cpi->initial_height;
-      printf("Warning: Desired height too large, changed to %d \n", cm->height);
-    }
-  }
-
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
-  update_frame_size(cpi);
-  return 0;
-}
-
-int vp9_switch_layer(VP9_PTR comp, int layer) {
-  VP9_COMP *cpi = (VP9_COMP *)comp;
-
-  if (cpi->use_svc) {
-    cpi->current_layer = layer;
-
-    // Use buffer i for layer i LST
-    cpi->lst_fb_idx = layer;
-
-    // Use buffer i-1 for layer i Alt (Inter-layer prediction)
-    if (layer != 0) cpi->alt_fb_idx = layer - 1;
-
-    // Use the rest for Golden
-    if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES)
-      cpi->gld_fb_idx = cpi->lst_fb_idx;
-    else
-      cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer;
-
-    printf("Switching to layer %d:\n", layer);
-    printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx,
-           cpi->gld_fb_idx, cpi->alt_fb_idx);
-  } else {
-    printf("Switching layer not supported. Enable SVC first \n");
-  }
-  return 0;
-}
-
-void vp9_set_svc(VP9_PTR comp, int use_svc) {
-  VP9_COMP *cpi = (VP9_COMP *)comp;
-  cpi->use_svc = use_svc;
-  if (cpi->use_svc) printf("Enabled SVC encoder \n");
-  return;
-}
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
-  int i, j;
-  int total = 0;
-
-  uint8_t *src = source->y_buffer;
-  uint8_t *dst = dest->y_buffer;
-
-  // Loop through the Y plane raw and reconstruction data summing
-  // (square differences)
-  for (i = 0; i < source->y_height; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return total;
-}
-
-
-int vp9_get_quantizer(VP9_PTR c) {
-  return ((VP9_COMP *)c)->common.base_qindex;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_onyx_int.h
deleted file mode 100644
index 0498043fc72..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_onyx_int.h
+++ /dev/null
@@ -1,727 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_ONYX_INT_H_
-#define VP9_ENCODER_VP9_ONYX_INT_H_
-
-#include <stdio.h>
-#include "./vpx_config.h"
-#include "vp9/common/vp9_onyx.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymode.h"
-#include "vpx_ports/mem.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/encoder/vp9_lookahead.h"
-
-// Experimental rate control switches
-#if CONFIG_ONESHOTQ
-#define ONE_SHOT_Q_ESTIMATE 0
-#define STRICT_ONE_SHOT_Q 0
-#endif
-#define DISABLE_RC_LONG_TERM_MEM 0
-
-// #define MODE_TEST_HIT_STATS
-
-// #define SPEEDSTATS 1
-#if CONFIG_MULTIPLE_ARF
-// Set MIN_GF_INTERVAL to 1 for the full decomposition.
-#define MIN_GF_INTERVAL             2
-#else
-#define MIN_GF_INTERVAL             4
-#endif
-#define DEFAULT_GF_INTERVAL         7
-
-#define KEY_FRAME_CONTEXT 5
-
-#define MAX_MODES 30
-#define MAX_REFS  6
-
-#define MIN_THRESHMULT  32
-#define MAX_THRESHMULT  512
-
-#define GF_ZEROMV_ZBIN_BOOST 0
-#define LF_ZEROMV_ZBIN_BOOST 0
-#define MV_ZBIN_BOOST        0
-#define SPLIT_MV_ZBIN_BOOST  0
-#define INTRA_ZBIN_BOOST     0
-
-typedef struct {
-  int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
-  int nmvcosts_hp[2][MV_VALS];
-
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
-
-  unsigned char *last_frame_seg_map_copy;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
-  // 0 = ZERO_MV, MV
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
-
-  int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
-  FRAME_CONTEXT fc;
-} CODING_CONTEXT;
-
-typedef struct {
-  double frame;
-  double intra_error;
-  double coded_error;
-  double sr_coded_error;
-  double ssim_weighted_pred_err;
-  double pcnt_inter;
-  double pcnt_motion;
-  double pcnt_second_ref;
-  double pcnt_neutral;
-  double MVr;
-  double mvr_abs;
-  double MVc;
-  double mvc_abs;
-  double MVrv;
-  double MVcv;
-  double mv_in_out_count;
-  double new_mv_count;
-  double duration;
-  double count;
-} FIRSTPASS_STATS;
-
-typedef struct {
-  int frames_so_far;
-  double frame_intra_error;
-  double frame_coded_error;
-  double frame_pcnt_inter;
-  double frame_pcnt_motion;
-  double frame_mvr;
-  double frame_mvr_abs;
-  double frame_mvc;
-  double frame_mvc_abs;
-} ONEPASS_FRAMESTATS;
-
-typedef struct {
-  struct {
-    int err;
-    union {
-      int_mv mv;
-      MB_PREDICTION_MODE mode;
-    } m;
-  } ref[MAX_REF_FRAMES];
-} MBGRAPH_MB_STATS;
-
-typedef struct {
-  MBGRAPH_MB_STATS *mb_stats;
-} MBGRAPH_FRAME_STATS;
-
-// This enumerator type needs to be kept aligned with the mode order in
-// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
-typedef enum {
-  THR_NEARESTMV,
-  THR_NEARESTA,
-  THR_NEARESTG,
-
-  THR_DC,
-
-  THR_NEWMV,
-  THR_NEWA,
-  THR_NEWG,
-
-  THR_NEARMV,
-  THR_NEARA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARESTGA,
-
-  THR_TM,
-
-  THR_COMP_NEARLA,
-  THR_COMP_NEWLA,
-  THR_NEARG,
-  THR_COMP_NEARGA,
-  THR_COMP_NEWGA,
-
-  THR_ZEROMV,
-  THR_ZEROG,
-  THR_ZEROA,
-  THR_COMP_ZEROLA,
-  THR_COMP_ZEROGA,
-
-  THR_H_PRED,
-  THR_V_PRED,
-  THR_D135_PRED,
-  THR_D207_PRED,
-  THR_D153_PRED,
-  THR_D63_PRED,
-  THR_D117_PRED,
-  THR_D45_PRED,
-} THR_MODES;
-
-typedef enum {
-  THR_LAST,
-  THR_GOLD,
-  THR_ALTR,
-  THR_COMP_LA,
-  THR_COMP_GA,
-  THR_INTRA,
-} THR_MODES_SUB8X8;
-
-typedef enum {
-  DIAMOND = 0,
-  NSTEP = 1,
-  HEX = 2,
-  BIGDIA = 3,
-  SQUARE = 4
-} SEARCH_METHODS;
-
-typedef enum {
-  USE_FULL_RD = 0,
-  USE_LARGESTINTRA,
-  USE_LARGESTINTRA_MODELINTER,
-  USE_LARGESTALL
-} TX_SIZE_SEARCH_METHOD;
-
-typedef enum {
-  // Values should be powers of 2 so that they can be selected as bits of
-  // an integer flags field
-
-  // terminate search early based on distortion so far compared to
-  // qp step, distortion in the neighborhood of the frame, etc.
-  FLAG_EARLY_TERMINATE = 1,
-
-  // skips comp inter modes if the best so far is an intra mode
-  FLAG_SKIP_COMP_BESTINTRA = 2,
-
-  // skips comp inter modes if the best single intermode so far does
-  // not have the same reference as one of the two references being
-  // tested
-  FLAG_SKIP_COMP_REFMISMATCH = 4,
-
-  // skips oblique intra modes if the best so far is an inter mode
-  FLAG_SKIP_INTRA_BESTINTER = 8,
-
-  // skips oblique intra modes  at angles 27, 63, 117, 153 if the best
-  // intra so far is not one of the neighboring directions
-  FLAG_SKIP_INTRA_DIRMISMATCH = 16,
-
-  // skips intra modes other than DC_PRED if the source variance
-  // is small
-  FLAG_SKIP_INTRA_LOWVAR = 32,
-} MODE_SEARCH_SKIP_LOGIC;
-
-typedef enum {
-  SUBPEL_ITERATIVE = 0,
-  SUBPEL_TREE = 1,
-  // Other methods to come
-} SUBPEL_SEARCH_METHODS;
-
-#define ALL_INTRA_MODES 0x3FF
-#define INTRA_DC_ONLY 0x01
-#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
-#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
-#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
-
-typedef enum {
-  LAST_FRAME_PARTITION_OFF = 0,
-  LAST_FRAME_PARTITION_LOW_MOTION = 1,
-  LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef struct {
-  int RD;
-  SEARCH_METHODS search_method;
-  int auto_filter;
-  int recode_loop;
-  SUBPEL_SEARCH_METHODS subpel_search_method;
-  int subpel_iters_per_step;
-  int thresh_mult[MAX_MODES];
-  int thresh_mult_sub8x8[MAX_REFS];
-  int max_step_search_steps;
-  int reduce_first_step_size;
-  int auto_mv_step_size;
-  int optimize_coefficients;
-  int static_segmentation;
-  int variance_adaptive_quantization;
-  int comp_inter_joint_search_thresh;
-  int adaptive_rd_thresh;
-  int skip_encode_sb;
-  int skip_encode_frame;
-  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
-  TX_SIZE_SEARCH_METHOD tx_size_search_method;
-  int use_lp32x32fdct;
-  int use_avoid_tested_higherror;
-  int use_one_partition_size_always;
-  int less_rectangular_check;
-  int use_square_partition_only;
-  int mode_skip_start;
-  int reference_masking;
-  BLOCK_SIZE always_this_block_size;
-  int auto_min_max_partition_size;
-  BLOCK_SIZE min_partition_size;
-  BLOCK_SIZE max_partition_size;
-  int adjust_partitioning_from_last_frame;
-  int last_partitioning_redo_frequency;
-  int disable_split_mask;
-  int using_small_partition_info;
-  // TODO(jingning): combine the related motion search speed features
-  int adaptive_motion_search;
-
-  // Implements various heuristics to skip searching modes
-  // The heuristics selected are based on  flags
-  // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
-  unsigned int mode_search_skip_flags;
-  // A source variance threshold below which the split mode is disabled
-  unsigned int disable_split_var_thresh;
-  // A source variance threshold below which filter search is disabled
-  // Choose a very large value (UINT_MAX) to use 8-tap always
-  unsigned int disable_filter_search_var_thresh;
-  int intra_y_mode_mask[TX_SIZES];
-  int intra_uv_mode_mask[TX_SIZES];
-  int use_rd_breakout;
-  int use_uv_intra_rd_estimate;
-  int use_fast_lpf_pick;
-  int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
-} SPEED_FEATURES;
-
-typedef struct VP9_COMP {
-  DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
-
-  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
-
-#if CONFIG_ALPHA
-  DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]);
-#endif
-
-  MACROBLOCK mb;
-  VP9_COMMON common;
-  VP9_CONFIG oxcf;
-  struct rdcost_block_args rdcost_stack;
-
-  struct lookahead_ctx    *lookahead;
-  struct lookahead_entry  *source;
-#if CONFIG_MULTIPLE_ARF
-  struct lookahead_entry  *alt_ref_source[NUM_REF_FRAMES];
-#else
-  struct lookahead_entry  *alt_ref_source;
-#endif
-
-  YV12_BUFFER_CONFIG *Source;
-  YV12_BUFFER_CONFIG *un_scaled_source;
-  YV12_BUFFER_CONFIG scaled_source;
-
-  unsigned int frames_till_alt_ref_frame;
-  int source_alt_ref_pending;
-  int source_alt_ref_active;
-
-  int is_src_frame_alt_ref;
-
-  int gold_is_last;  // gold same as last frame ( short circuit gold searches)
-  int alt_is_last;  // Alt same as last ( short circuit altref search)
-  int gold_is_alt;  // don't do both alt and gold search ( just do gold).
-
-  int scaled_ref_idx[3];
-  int lst_fb_idx;
-  int gld_fb_idx;
-  int alt_fb_idx;
-
-  int current_layer;
-  int use_svc;
-
-#if CONFIG_MULTIPLE_ARF
-  int alt_ref_fb_idx[NUM_REF_FRAMES - 3];
-#endif
-  int refresh_last_frame;
-  int refresh_golden_frame;
-  int refresh_alt_ref_frame;
-  YV12_BUFFER_CONFIG last_frame_uf;
-
-  TOKENEXTRA *tok;
-  unsigned int tok_count[4][1 << 6];
-
-
-  unsigned int frames_since_key;
-  unsigned int key_frame_frequency;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-#if CONFIG_MULTIPLE_ARF
-  // Position within a frame coding order (including any additional ARF frames).
-  unsigned int sequence_number;
-  // Next frame in naturally occurring order that has not yet been coded.
-  int next_frame_in_order;
-#endif
-
-  // Ambient reconstruction err target for force key frames
-  int ambient_err;
-
-  unsigned int mode_chosen_counts[MAX_MODES];
-  unsigned int sub8x8_mode_chosen_counts[MAX_REFS];
-  int64_t mode_skip_mask;
-  int ref_frame_mask;
-  int set_ref_frame_mask;
-
-  int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
-  int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-  int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
-  int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
-
-  int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
-  int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
-  unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
-  unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
-  unsigned int single_ref_count[REF_CONTEXTS][2][2];
-  unsigned int comp_ref_count[REF_CONTEXTS][2];
-
-  int64_t rd_tx_select_diff[TX_MODES];
-  // FIXME(rbultje) can this overflow?
-  int rd_tx_select_threshes[4][TX_MODES];
-
-  int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
-  int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
-
-  int RDMULT;
-  int RDDIV;
-
-  CODING_CONTEXT coding_context;
-
-  // Rate targetting variables
-  int this_frame_target;
-  int projected_frame_size;
-  int last_q[2];                   // Separate values for Intra/Inter
-  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
-
-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
-
-  unsigned int frames_since_golden;
-  int frames_till_gf_update_due;  // Count down till next GF
-
-  int gf_overspend_bits;  // cumulative bits overspent because of GF boost
-
-  int non_gf_bitrate_adjustment;  // Following GF to recover extra bits spent
-
-  int kf_overspend_bits;  // Bits spent on key frames to be recovered on inters
-  int kf_bitrate_adjustment;  // number of bits to recover on each inter frame.
-  int max_gf_interval;
-  int baseline_gf_interval;
-  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
-  int active_arnr_strength;         // <= cpi->oxcf.arnr_max_strength
-
-  int64_t key_frame_count;
-  int prior_key_frame_distance[KEY_FRAME_CONTEXT];
-  int per_frame_bandwidth;  // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;  // Average frame size target for clip
-  int min_frame_bandwidth;  // Minimum allocation used for any frame
-  int inter_frame_target;
-  double output_framerate;
-  int64_t last_time_stamp_seen;
-  int64_t last_end_time_stamp_seen;
-  int64_t first_time_stamp_ever;
-
-  int ni_av_qi;
-  int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex;
-  double tot_q;
-  double avg_q;
-
-  int zbin_mode_boost;
-  int zbin_mode_boost_enabled;
-
-  int64_t total_byte_count;
-
-  int buffered_mode;
-
-  int buffer_level;
-  int bits_off_target;
-
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
-  int64_t total_actual_bits;
-  int total_target_vs_actual;        // debug stats
-
-  int worst_quality;
-  int active_worst_quality;
-  int best_quality;
-  int active_best_quality;
-
-  int cq_target_quality;
-
-  int y_mode_count[4][INTRA_MODES];
-  int y_uv_mode_count[INTRA_MODES][INTRA_MODES];
-  unsigned int partition_count[PARTITION_CONTEXTS][PARTITION_TYPES];
-
-  nmv_context_counts NMVcount;
-
-  vp9_coeff_count coef_counts[TX_SIZES][BLOCK_TYPES];
-  vp9_coeff_probs_model frame_coef_probs[TX_SIZES][BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct[TX_SIZES][BLOCK_TYPES];
-
-  int gfu_boost;
-  int last_boost;
-  int kf_boost;
-  int kf_zeromotion_pct;
-  int gf_zeromotion_pct;
-
-  int64_t target_bandwidth;
-  struct vpx_codec_pkt_list  *output_pkt_list;
-
-#if 0
-  // Experimental code for lagged and one pass
-  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
-  int one_pass_frame_index;
-#endif
-  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
-  int mbgraph_n_frames;             // number of frames filled in the above
-  int static_mb_pct;                // % forced skip mbs by segmentation
-  int seg0_progress, seg0_idx, seg0_cnt;
-
-  int decimation_factor;
-  int decimation_count;
-
-  // for real time encoding
-  int avg_encode_time;              // microsecond
-  int avg_pick_mode_time;            // microsecond
-  int speed;
-  unsigned int cpu_freq;           // Mhz
-  int compressor_speed;
-
-  int interquantizer;
-  int goldfreq;
-  int auto_worst_q;
-  int cpu_used;
-  int pass;
-
-  vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
-  int last_skip_probs_q[3];
-
-  int ref_frame_flags;
-
-  SPEED_FEATURES sf;
-  int error_bins[1024];
-
-  unsigned int max_mv_magnitude;
-  int mv_step_param;
-
-  // Data used for real time conferencing mode to help determine if it
-  // would be good to update the gf
-  int inter_zz_count;
-  int gf_bad_count;
-  int gf_update_recommended;
-
-  unsigned char *segmentation_map;
-
-  // segment threashold for encode breakout
-  int  segment_encode_breakout[MAX_SEGMENTS];
-
-  unsigned char *active_map;
-  unsigned int active_map_enabled;
-
-  fractional_mv_step_fp *find_fractional_mv_step;
-  fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
-  vp9_full_search_fn_t full_search_sad;
-  vp9_refining_search_fn_t refining_search_sad;
-  vp9_diamond_search_fn_t diamond_search_sad;
-  vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
-  uint64_t time_receive_data;
-  uint64_t time_compress_data;
-  uint64_t time_pick_lpf;
-  uint64_t time_encode_sb_row;
-
-  struct twopass_rc {
-    unsigned int section_intra_rating;
-    unsigned int next_iiratio;
-    unsigned int this_iiratio;
-    FIRSTPASS_STATS total_stats;
-    FIRSTPASS_STATS this_frame_stats;
-    FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS total_left_stats;
-    int first_pass_done;
-    int64_t bits_left;
-    int64_t clip_bits_total;
-    double avg_iiratio;
-    double modified_error_total;
-    double modified_error_used;
-    double modified_error_left;
-    double kf_intra_err_min;
-    double gf_intra_err_min;
-    int frames_to_key;
-    int maxq_max_limit;
-    int maxq_min_limit;
-    int static_scene_max_gf_interval;
-    int kf_bits;
-    // Remaining error from uncoded frames in a gf group. Two pass use only
-    int64_t gf_group_error_left;
-
-    // Projected total bits available for a key frame group of frames
-    int64_t kf_group_bits;
-
-    // Error score of frames still to be coded in kf group
-    int64_t kf_group_error_left;
-
-    // Projected Bits available for a group of frames including 1 GF or ARF
-    int64_t gf_group_bits;
-    // Bits for the golden frame or ARF - 2 pass only
-    int gf_bits;
-    int alt_extra_bits;
-
-    int sr_update_lag;
-    double est_max_qcorrection_factor;
-  } twopass;
-
-  YV12_BUFFER_CONFIG alt_ref_buffer;
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
-  int fixed_divide[512];
-
-#if CONFIG_INTERNAL_STATS
-  int    count;
-  double total_y;
-  double total_u;
-  double total_v;
-  double total;
-  double total_sq_error;
-  double totalp_y;
-  double totalp_u;
-  double totalp_v;
-  double totalp;
-  double total_sq_error2;
-  int    bytes;
-  double summed_quality;
-  double summed_weights;
-  double summedp_quality;
-  double summedp_weights;
-  unsigned int tot_recode_hits;
-
-
-  double total_ssimg_y;
-  double total_ssimg_u;
-  double total_ssimg_v;
-  double total_ssimg_all;
-
-  int b_calculate_ssimg;
-#endif
-  int b_calculate_psnr;
-
-  // Per MB activity measurement
-  unsigned int activity_avg;
-  unsigned int *mb_activity_map;
-  int *mb_norm_activity_map;
-  int output_partition;
-
-  /* force next frame to intra when kf_auto says so */
-  int force_next_frame_intra;
-
-  int droppable;
-
-  int dummy_packing;    /* flag to indicate if packing is dummy */
-
-  unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
-                                      [SWITCHABLE_FILTERS];
-
-  unsigned int tx_stepdown_count[TX_SIZES];
-
-  int initial_width;
-  int initial_height;
-
-  int number_spatial_layers;
-  int enable_encode_breakout;   // Default value is 1. From first pass stats,
-                                // encode_breakout may be disabled.
-
-#if CONFIG_MULTIPLE_ARF
-  // ARF tracking variables.
-  int multi_arf_enabled;
-  unsigned int frame_coding_order_period;
-  unsigned int new_frame_coding_order_period;
-  int frame_coding_order[MAX_LAG_BUFFERS * 2];
-  int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
-  int arf_weight[MAX_LAG_BUFFERS];
-  int arf_buffered;
-  int this_frame_weight;
-  int max_arf_level;
-#endif
-
-#ifdef ENTROPY_STATS
-  int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
-#endif
-
-
-#ifdef MODE_TEST_HIT_STATS
-  // Debug / test stats
-  int64_t mode_test_hits[BLOCK_SIZES];
-#endif
-
-  /* Y,U,V,(A) */
-  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
-  PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[8];
-} VP9_COMP;
-
-static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
-  if (ref_frame == LAST_FRAME) {
-    return cpi->lst_fb_idx;
-  } else if (ref_frame == GOLDEN_FRAME) {
-    return cpi->gld_fb_idx;
-  } else {
-    return cpi->alt_fb_idx;
-  }
-}
-
-static int get_scale_ref_frame_idx(VP9_COMP *cpi,
-                                   MV_REFERENCE_FRAME ref_frame) {
-  if (ref_frame == LAST_FRAME) {
-    return 0;
-  } else if (ref_frame == GOLDEN_FRAME) {
-    return 1;
-  } else {
-    return 2;
-  }
-}
-
-void vp9_encode_frame(VP9_COMP *cpi);
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size);
-
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
-
-void vp9_set_speed_features(VP9_COMP *cpi);
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-
-void vp9_alloc_compressor_data(VP9_COMP *cpi);
-
-int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget);
-
-static int get_token_alloc(int mb_rows, int mb_cols) {
-  return mb_rows * mb_cols * (48 * 16 + 4);
-}
-
-#endif  // VP9_ENCODER_VP9_ONYX_INT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
index 476ecaaa254..53284656e3b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
@@ -10,162 +10,96 @@
 
 #include <assert.h>
 #include <limits.h>
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "./vpx_scale_rtcd.h"
-
-void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
-                                   YV12_BUFFER_CONFIG *dst_ybc, int fraction) {
-  const int height = src_ybc->y_height;
-  const int stride = src_ybc->y_stride;
-  const int offset = stride * ((height >> 5) * 16 - 8);
-  const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4;
-
-  assert(src_ybc->y_stride == dst_ybc->y_stride);
-  vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset,
-             stride * (lines_to_copy + 16));
-}
-
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
-                                YV12_BUFFER_CONFIG *dest, int Fraction) {
-  int i, j;
-  int Total = 0;
-  int srcoffset, dstoffset;
-  uint8_t *src = source->y_buffer;
-  uint8_t *dst = dest->y_buffer;
-
-  int linestocopy = (source->y_height >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
 
+#include "./vpx_scale_rtcd.h"
 
-  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
-  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
+#include "vpx_mem/vpx_mem.h"
 
-  src += srcoffset;
-  dst += dstoffset;
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
 
-  // Loop through the raw Y plane and reconstruction data summing the square
-  // differences.
-  for (i = 0; i < linestocopy; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_quantize.h"
 
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
+static int get_max_filter_level(const VP9_COMP *cpi) {
+  if (cpi->pass == 2) {
+    return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+                                                 : MAX_LOOP_FILTER;
+  } else {
+    return MAX_LOOP_FILTER;
   }
-
-  return Total;
-}
-
-// Enforce a minimum filter level based upon baseline Q
-static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
-  int min_filter_level;
-  min_filter_level = 0;
-
-  return min_filter_level;
 }
 
-// Enforce a maximum filter level based upon baseline Q
-static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
-  int max_filter_level = MAX_LOOP_FILTER;
-  (void)base_qindex;
 
-  if (cpi->twopass.section_intra_rating > 8)
-    max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
+                            int filt_level, int partial_frame) {
+  VP9_COMMON *const cm = &cpi->common;
+  int filt_err;
 
-  return max_filter_level;
-}
+  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
+                        partial_frame);
+  filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
 
+  // Re-instate the unfiltered frame
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
-// Stub function for now Alt LF not used
-void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
+  return filt_err;
 }
 
-void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
-  VP9_COMMON *const cm = &cpi->common;
-  struct loopfilter *const lf = &cm->lf;
-
-  int best_err = 0;
-  int filt_err = 0;
-  const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-
-  int filter_step;
-  int filt_high = 0;
-  // Start search at previous frame filter level
-  int filt_mid = lf->filter_level;
-  int filt_low = 0;
-  int filt_best;
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
+                               int partial_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const struct loopfilter *const lf = &cm->lf;
+  const int min_filter_level = 0;
+  const int max_filter_level = get_max_filter_level(cpi);
   int filt_direction = 0;
-
-  int Bias = 0;  // Bias against raising loop filter in favor of lowering it.
-
-  //  Make a copy of the unfiltered / processed recon buffer
-  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
-
-  lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
-                                                    : cpi->oxcf.Sharpness;
+  int best_err, filt_best;
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
-  filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Sum squared error at each filter level
+  int ss_err[MAX_LOOP_FILTER + 1];
 
-  // Define the initial step size
-  filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Set each entry to -1 (every byte 0xFF) to mark the level as untried.
+  vpx_memset(ss_err, 0xFF, sizeof(ss_err));
 
-  // Get baseline error score
-  vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial);
+  //  Make a copy of the unfiltered / processed recon buffer
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
 
-  best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
   filt_best = filt_mid;
-
-  //  Re-instate the unfiltered frame
-  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+  ss_err[filt_mid] = best_err;
 
   while (filter_step > 0) {
-    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+    const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
+    int filt_err;
 
-    if (cpi->twopass.section_intra_rating < 20)
-      Bias = Bias * cpi->twopass.section_intra_rating / 20;
+    // Bias against raising loop filter in favor of lowering it.
+    int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-    // yx, bias less for large block size
-    if (cpi->common.tx_mode != ONLY_4X4)
-      Bias >>= 1;
+    if ((cpi->pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
 
-    filt_high = ((filt_mid + filter_step) > max_filter_level)
-                    ? max_filter_level
-                    : (filt_mid + filter_step);
-    filt_low = ((filt_mid - filter_step) < min_filter_level)
-                   ? min_filter_level
-                   : (filt_mid - filter_step);
+    // yx, bias less for large block size
+    if (cm->tx_mode != ONLY_4X4)
+      bias >>= 1;
 
-    if ((filt_direction <= 0) && (filt_low != filt_mid)) {
+    if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
-      vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-      vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
+      if (ss_err[filt_low] < 0) {
+        filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
+        ss_err[filt_low] = filt_err;
+      } else {
+        filt_err = ss_err[filt_low];
+      }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
-      if ((filt_err - Bias) < best_err) {
+      if ((filt_err - bias) < best_err) {
         // Was it actually better than the previous best?
         if (filt_err < best_err)
           best_err = filt_err;
@@ -175,17 +109,15 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
     }
 
     // Now look at filt_high
-    if ((filt_direction >= 0) && (filt_high != filt_mid)) {
-      vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-      vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-
+    if (filt_direction >= 0 && filt_high != filt_mid) {
+      if (ss_err[filt_high] < 0) {
+        filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
+        ss_err[filt_high] = filt_err;
+      } else {
+        filt_err = ss_err[filt_high];
+      }
       // Was it better than the previous best?
-      if (filt_err < (best_err - Bias)) {
+      if (filt_err < (best_err - bias)) {
         best_err = filt_err;
         filt_best = filt_high;
       }
@@ -193,7 +125,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
 
     // Half the step distance if the best filter value was the same as last time
     if (filt_best == filt_mid) {
-      filter_step = filter_step / 2;
+      filter_step /= 2;
       filt_direction = 0;
     } else {
       filt_direction = (filt_best < filt_mid) ? -1 : 1;
@@ -201,5 +133,29 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
     }
   }
 
-  lf->filter_level = filt_best;
+  return filt_best;
+}
+
+void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
+                           LPF_PICK_METHOD method) {
+  VP9_COMMON *const cm = &cpi->common;
+  struct loopfilter *const lf = &cm->lf;
+
+  lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
+                                                    : cpi->oxcf.sharpness;
+
+  if (method == LPF_PICK_FROM_Q) {
+    const int min_filter_level = 0;
+    const int max_filter_level = get_max_filter_level(cpi);
+    const int q = vp9_ac_quant(cm->base_qindex, 0);
+    // Fixed-point (2^18) form of a linear fit to the searched level:
+    // filt_guess = (q / 4) * 0.316206 + 3.87252
+    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+    if (cm->frame_type == KEY_FRAME)
+      filt_guess -= 4;
+    lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+  } else {
+    lf->filter_level = search_filter_level(sd, cpi,
+                                           method == LPF_PICK_FROM_SUBIMAGE);
+  }
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.h
index 9de4cf849cc..33c490f6935 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.h
@@ -12,11 +12,19 @@
 #ifndef VP9_ENCODER_VP9_PICKLPF_H_
 #define VP9_ENCODER_VP9_PICKLPF_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/encoder/vp9_encoder.h"
+
 struct yv12_buffer_config;
 struct VP9_COMP;
 
-void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);
+void vp9_pick_filter_level(const struct yv12_buffer_config *sd,
+                           struct VP9_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
-void vp9_pick_filter_level(struct yv12_buffer_config *sd,
-                           struct VP9_COMP *cpi, int partial);
 #endif  // VP9_ENCODER_VP9_PICKLPF_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
new file mode 100644
index 00000000000..1e9887c2d44
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -0,0 +1,420 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
+static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[0];
+  const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
+  int i;
+  // Snapshot of the MV limits; they are written back after the search below.
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  vp9_set_mv_search_range(x, &ref_mv);
+
+  // TODO(jingning) exploiting adaptive motion search control in non-RD
+  // mode decision too.
+  step_param = 6;
+
+  for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
+    if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+      tmp_mv->as_int = INVALID_MV;  // the caller skips NEWMV on INVALID_MV
+
+      if (scaled_ref_frame) {
+        int i;
+        for (i = 0; i < MAX_MB_PLANE; i++)
+          xd->plane[i].pre[0] = backup_yv12[i];
+      }
+      return;  // NOTE(review): saved x->mv_* limits are not written back here
+    }
+  }
+  assert(x->mv_best_ref_index[ref] <= 2);
+  if (x->mv_best_ref_index[ref] < 2)
+    mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+  else
+    mvp_full = x->pred_mv[ref].as_mv;
+  // Drop the low three bits (presumably 1/8-pel to full-pel units).
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
+                    &tmp_mv->as_mv, INT_MAX, 0);
+
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  // Scale the found vector by 8 and compute its bit cost relative to ref_mv.
+  mvp_full.row = tmp_mv->as_mv.row * 8;
+  mvp_full.col = tmp_mv->as_mv.col * 8;
+  *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+}
+
+static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    MV *tmp_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  int ref = mbmi->ref_frame[0];
+  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
+  int dis;  // handed to the sub-pel search by address; never read here
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+  // Presumably refines *tmp_mv in place around ref_mv -- confirm in callee.
+  cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
+                               cpi->common.allow_high_precision_mv,
+                               x->errorperbit,
+                               &cpi->fn_ptr[bsize],
+                               cpi->sf.subpel_force_stop,
+                               cpi->sf.subpel_iters_per_step,
+                               x->nmvjointcost, x->mvcost,
+                               &dis, &x->pred_sse[ref]);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+  // Keep the result; full_pixel_motion_search() may start from x->pred_mv.
+  x->pred_mv[ref].as_mv = *tmp_mv;
+}
+
+static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                              MACROBLOCK *x, MACROBLOCKD *xd,
+                              int *out_rate_sum, int64_t *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  int rate;
+  int64_t dist;
+  // Plane 0 only: compare the source block against the current dst buffer.
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
+                                           pd->dst.buf, pd->dst.stride, &sse);
+
+  // TODO(jingning) This is a temporary solution to account for frames with
+  // light changes. Need to customize the rate-distortion modeling for non-RD
+  // mode decision.
+  if ((sse >> 3) > var)
+    sse = var;  // sse exceeds roughly 8x var: feed var in its place
+
+  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
+                               pd->dequant[1] >> 3, &rate, &dist);
+  *out_rate_sum = rate;
+  *out_dist_sum = dist << 3;
+}
+
+// Non-RD inter mode decision: tries the LAST_FRAME inter modes, then DC_PRED.
+// TODO(jingning) placeholder; this needs various further optimizations.
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            int mi_row, int mi_col,
+                            int *returnrate,
+                            int64_t *returndistortion,
+                            BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  PREDICTION_MODE this_mode, best_mode = ZEROMV;
+  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
+  INTERP_FILTER best_pred_filter = EIGHTTAP;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int64_t best_rd = INT64_MAX;
+  int64_t this_rd = INT64_MAX;
+
+  int rate = INT_MAX;
+  int64_t dist = INT64_MAX;
+
+  VP9_COMMON *cm = &cpi->common;
+  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
+
+  const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
+                                           intra_cost_penalty, 0);
+  const int64_t intra_mode_cost = 50;
+
+  unsigned char segment_id = mbmi->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
+  // Mode index conversion from THR_MODES to PREDICTION_MODE for a ref frame.
+  int mode_idx[MB_MODE_COUNT] = {0};
+  INTERP_FILTER filter_ref = SWITCHABLE;
+  int bsl = mi_width_log2_lookup[bsize];
+  int pred_filter_search = (((mi_row + mi_col) >> bsl) +
+                            cpi->sf.chessboard_index) & 0x01;
+
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  x->skip = 0;
+  if (!x->in_active_map)
+    x->skip = 1;
+  // initialize mode decisions
+  *returnrate = INT_MAX;
+  *returndistortion = INT64_MAX;
+  vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
+  mbmi->sb_type = bsize;
+  mbmi->ref_frame[0] = NONE;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ?
+                        EIGHTTAP : cpi->common.interp_filter;
+  mbmi->skip = 0;
+  mbmi->segment_id = segment_id;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      vp9_setup_buffer_inter(cpi, x, tile,
+                             ref_frame, bsize, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1]->mbmi.interp_filter;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // Select prediction reference frames.
+    xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
+
+    clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
+    clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
+
+    mbmi->ref_frame[0] = ref_frame;
+
+    // Set conversion index for LAST_FRAME.
+    if (ref_frame == LAST_FRAME) {
+      mode_idx[NEARESTMV] = THR_NEARESTMV;   // LAST_FRAME, NEARESTMV
+      mode_idx[NEARMV] = THR_NEARMV;         // LAST_FRAME, NEARMV
+      mode_idx[ZEROMV] = THR_ZEROMV;         // LAST_FRAME, ZEROMV
+      mode_idx[NEWMV] = THR_NEWMV;           // LAST_FRAME, NEWMV
+    }
+
+    for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+      int rate_mv = 0;
+
+      if (cpi->sf.disable_inter_mode_mask[bsize] &
+          (1 << INTER_OFFSET(this_mode)))
+        continue;
+      // NOTE(review): freq_fact indexed by this_mode, threshes by mode_idx[].
+      if (best_rd < ((int64_t)rd_threshes[mode_idx[this_mode]] *
+          rd_thresh_freq_fact[this_mode] >> 5) ||
+          rd_threshes[mode_idx[this_mode]] == INT_MAX)
+        continue;
+
+      if (this_mode == NEWMV) {
+        int rate_mode = 0;
+        if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+          continue;  // this_rd still holds the previous mode's RD cost
+
+        full_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                 &frame_mv[NEWMV][ref_frame], &rate_mv);
+
+        if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
+          continue;
+
+        rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                        [INTER_OFFSET(this_mode)];
+        if (RDCOST(x->rdmult, x->rddiv, rate_mv + rate_mode, 0) > best_rd)
+          continue;
+
+        sub_pixel_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                &frame_mv[NEWMV][ref_frame].as_mv);
+      }
+
+      if (this_mode != NEARESTMV)
+        if (frame_mv[this_mode][ref_frame].as_int ==
+            frame_mv[NEARESTMV][ref_frame].as_int)
+          continue;
+
+      mbmi->mode = this_mode;
+      mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+
+      // Search for the best prediction filter type, when the resulting
+      // motion vector is at sub-pixel accuracy level for luma component, i.e.,
+      // the last three bits of its row or col are not all zeros.
+      if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
+          pred_filter_search &&
+          ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
+           (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
+        int64_t tmp_rdcost1 = INT64_MAX;
+        int64_t tmp_rdcost2 = INT64_MAX;
+        int64_t tmp_rdcost3 = INT64_MAX;
+        int pf_rate[3];
+        int64_t pf_dist[3];
+
+        mbmi->interp_filter = EIGHTTAP;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP],
+                          &pf_dist[EIGHTTAP]);
+        tmp_rdcost1 = RDCOST(x->rdmult, x->rddiv,
+                             vp9_get_switchable_rate(cpi) + pf_rate[EIGHTTAP],
+                             pf_dist[EIGHTTAP]);
+
+        mbmi->interp_filter = EIGHTTAP_SHARP;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SHARP],
+                          &pf_dist[EIGHTTAP_SHARP]);
+        tmp_rdcost2 = RDCOST(x->rdmult, x->rddiv, vp9_get_switchable_rate(cpi) +
+                                 pf_rate[EIGHTTAP_SHARP],
+                             pf_dist[EIGHTTAP_SHARP]);
+
+        mbmi->interp_filter = EIGHTTAP_SMOOTH;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SMOOTH],
+                          &pf_dist[EIGHTTAP_SMOOTH]);
+        tmp_rdcost3 = RDCOST(x->rdmult, x->rddiv, vp9_get_switchable_rate(cpi) +
+                                 pf_rate[EIGHTTAP_SMOOTH],
+                             pf_dist[EIGHTTAP_SMOOTH]);
+
+        if (tmp_rdcost2 < tmp_rdcost1) {
+          if (tmp_rdcost2 < tmp_rdcost3)
+            mbmi->interp_filter = EIGHTTAP_SHARP;
+          else
+            mbmi->interp_filter = EIGHTTAP_SMOOTH;
+        } else {
+          if (tmp_rdcost1 < tmp_rdcost3)
+            mbmi->interp_filter = EIGHTTAP;
+          else
+            mbmi->interp_filter = EIGHTTAP_SMOOTH;
+        }
+
+        rate = pf_rate[mbmi->interp_filter];
+        dist = pf_dist[mbmi->interp_filter];
+      } else {
+        mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
+      }
+
+      rate += rate_mv;
+      rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+                                [INTER_OFFSET(this_mode)];
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
+        *returnrate = rate;
+        *returndistortion = dist;
+        best_mode = this_mode;
+        best_pred_filter = mbmi->interp_filter;
+        best_ref_frame = ref_frame;
+      }
+    }
+  }
+
+  mbmi->mode = best_mode;
+  mbmi->interp_filter = best_pred_filter;
+  mbmi->ref_frame[0] = best_ref_frame;
+  mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+
+  // Perform intra prediction search, if the best inter RD cost is above a
+  // threshold and the block is smaller than sf.max_intra_bsize.
+  if (best_rd > inter_mode_thresh && bsize < cpi->sf.max_intra_bsize) {
+    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+      vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
+                              mbmi->tx_size, this_mode,
+                              &p->src.buf[0], p->src.stride,
+                              &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
+
+      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
+      rate += cpi->mbmode_cost[this_mode];
+      rate += intra_cost_penalty;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+      if (this_rd + intra_mode_cost < best_rd) {
+        best_rd = this_rd;
+        *returnrate = rate;
+        *returndistortion = dist;
+        mbmi->mode = this_mode;
+        mbmi->ref_frame[0] = INTRA_FRAME;
+        mbmi->uv_mode = this_mode;
+        mbmi->mv[0].as_int = INVALID_MV;
+      }
+    }
+  }
+
+  return INT64_MAX;  // NOTE(review): constant; results go via out-params
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
new file mode 100644
index 00000000000..a9c948d31a5
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_PICKMODE_H_
+#define VP9_ENCODER_VP9_PICKMODE_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            const struct TileInfo *const tile,
+                            int mi_row, int mi_col,
+                            int *returnrate,
+                            int64_t *returndistortion,
+                            BLOCK_SIZE bsize);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_PICKMODE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnr.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnr.c
deleted file mode 100644
index 58294e15a38..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnr.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-
-#include "vpx_scale/yv12config.h"
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double samples, double peak, double mse) {
-  double psnr;
-
-  if (mse > 0.0)
-    psnr = 10.0 * log10(peak * peak * samples / mse);
-  else
-    psnr = MAX_PSNR;  // Limit to prevent / 0
-
-  if (psnr > MAX_PSNR)
-    psnr = MAX_PSNR;
-
-  return psnr;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnr.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnr.h
deleted file mode 100644
index 15dd8366bd8..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnr.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_PSNR_H_
-#define VP9_ENCODER_VP9_PSNR_H_
-
-double vp9_mse2psnr(double samples, double peak, double mse);
-
-#endif  // VP9_ENCODER_VP9_PSNR_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
index fca75252430..4d3086d6075 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
@@ -9,20 +9,17 @@
  */
 
 #include <math.h>
+
 #include "vpx_mem/vpx_mem.h"
 
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_quant_common.h"
-
 #include "vp9/common/vp9_seg_common.h"
 
-#ifdef ENC_DEBUG
-extern int enc_debug;
-#endif
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rdopt.h"
 
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
@@ -30,58 +27,45 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
-  int i, rc, eob;
-  int zbins[2], nzbins[2], zbin;
-  int x, y, z, sz;
-  int zero_flag = n_coeffs;
-
-  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-
-  eob = -1;
+  int i, non_zero_count = (int)count, eob = -1;
+  const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
+                         zbin_ptr[1] + zbin_oq_value };
+  const int nzbins[2] = { zbins[0] * -1,
+                          zbins[1] * -1 };
+  (void)iscan;
 
-  // Base ZBIN
-  zbins[0] = zbin_ptr[0] + zbin_oq_value;
-  zbins[1] = zbin_ptr[1] + zbin_oq_value;
-  nzbins[0] = zbins[0] * -1;
-  nzbins[1] = zbins[1] * -1;
+  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
 
   if (!skip_block) {
     // Pre-scan pass
-    for (i = n_coeffs - 1; i >= 0; i--) {
-      rc = scan[i];
-      z = coeff_ptr[rc];
+    for (i = (int)count - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
 
-      if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
-        zero_flag--;
-      } else {
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
         break;
-      }
     }
 
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < zero_flag; i++) {
-      rc = scan[i];
-      z  = coeff_ptr[rc];
-
-      zbin = (zbins[rc != 0]);
-
-      sz = (z >> 31);                               // sign of z
-      x  = (z ^ sz) - sz;
-
-      if (x >= zbin) {
-        x += (round_ptr[rc != 0]);
-        x  = clamp(x, INT16_MIN, INT16_MAX);
-        y  = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
-              quant_shift_ptr[rc != 0]) >> 16;      // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0];  // dequantized value
-
-        if (y) {
-          eob = i;                                  // last nonzero coeffs
-        }
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;
       }
     }
   }
@@ -97,99 +81,65 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
                             const int16_t *dequant_ptr,
                             int zbin_oq_value, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
-  int i, rc, eob;
-  int zbins[2], nzbins[2];
-  int x, y, z, sz;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
   int idx = 0;
   int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
 
-  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-
-  eob = -1;
-
-  // Base ZBIN
-  zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
-  zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
-  nzbins[0] = zbins[0] * -1;
-  nzbins[1] = zbins[1] * -1;
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
 
   if (!skip_block) {
     // Pre-scan pass
     for (i = 0; i < n_coeffs; i++) {
-      rc = scan[i];
-      z = coeff_ptr[rc];
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
 
       // If the coefficient is out of the base ZBIN range, keep it for
       // quantization.
-      if (z >= zbins[rc != 0] || z <= nzbins[rc != 0])
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
         idx_arr[idx++] = i;
     }
 
     // Quantization pass: only process the coefficients selected in
     // pre-scan pass. Note: idx can be zero.
     for (i = 0; i < idx; i++) {
-      rc = scan[idx_arr[i]];
-
-      z = coeff_ptr[rc];
-      sz = (z >> 31);                               // sign of z
-      x  = (z ^ sz) - sz;                           // x = abs(z)
-
-      x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      x  = clamp(x, INT16_MIN, INT16_MAX);
-      y  = ((((x * quant_ptr[rc != 0]) >> 16) + x) *
-            quant_shift_ptr[rc != 0]) >> 15;      // quantize (x)
-
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2;  // dequantized value
-
-      if (y)
-        eob = idx_arr[i];                         // last nonzero coeffs
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];
     }
   }
   *eob_ptr = eob + 1;
 }
 
-struct plane_block_idx {
-  int plane;
-  int block;
-};
-
-// TODO(jkoleszar): returning a struct so it can be used in a const context,
-// expect to refactor this further later.
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
-                                                     int b_idx) {
-  const int v_offset = y_blocks * 5 / 4;
-  struct plane_block_idx res;
-
-  if (b_idx < y_blocks) {
-    res.plane = 0;
-    res.block = b_idx;
-  } else if (b_idx < v_offset) {
-    res.plane = 1;
-    res.block = b_idx - y_blocks;
-  } else {
-    assert(b_idx < y_blocks * 3 / 2);
-    res.plane = 2;
-    res.block = b_idx - v_offset;
-  }
-  return res;
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  struct macroblock_plane* p = &x->plane[pb_idx.plane];
-  struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
 
-  vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+  vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
            16, x->skip_block,
            p->zbin, p->round, p->quant, p->quant_shift,
-           BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
-           BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
-           pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
+           BLOCK_OFFSET(p->qcoeff, block),
+           BLOCK_OFFSET(pd->dqcoeff, block),
+           pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -204,132 +154,119 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
 }
 
 void vp9_init_quantizer(VP9_COMP *cpi) {
-  int i, q;
   VP9_COMMON *const cm = &cpi->common;
+  QUANTS *const quants = &cpi->quants;
+  int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
     const int qrounding_factor = q == 0 ? 64 : 48;
 
-    // y
     for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
-                               : vp9_ac_quant(q, 0);
-      invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant);
-      cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      cpi->y_round[q][i] = (qrounding_factor * quant) >> 7;
+      // y
+      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
+                     : vp9_ac_quant(q, 0);
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->y_dequant[q][i] = quant;
-    }
 
-    // uv
-    for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
-                               : vp9_ac_quant(q, cm->uv_ac_delta_q);
-      invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant);
-      cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+      // uv
+      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
+                     : vp9_ac_quant(q, cm->uv_ac_delta_q);
+      invert_quant(&quants->uv_quant[q][i],
+                   &quants->uv_quant_shift[q][i], quant);
+      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->uv_dequant[q][i] = quant;
-    }
 
 #if CONFIG_ALPHA
-    // alpha
-    for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
-                               : vp9_ac_quant(q, cm->a_ac_delta_q);
-      invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant);
-      cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
-      cpi->a_round[q][i] = (qrounding_factor * quant) >> 7;
+      // alpha
+      quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
+                     : vp9_ac_quant(q, cm->a_ac_delta_q);
+      invert_quant(&quants->a_quant[q][i], &quants->a_quant_shift[q][i], quant);
+      quants->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+      quants->a_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->a_dequant[q][i] = quant;
-    }
 #endif
+    }
 
     for (i = 2; i < 8; i++) {
-      cpi->y_quant[q][i] = cpi->y_quant[q][1];
-      cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1];
-      cpi->y_zbin[q][i] = cpi->y_zbin[q][1];
-      cpi->y_round[q][i] = cpi->y_round[q][1];
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
       cm->y_dequant[q][i] = cm->y_dequant[q][1];
 
-      cpi->uv_quant[q][i] = cpi->uv_quant[q][1];
-      cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1];
-      cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1];
-      cpi->uv_round[q][i] = cpi->uv_round[q][1];
+      quants->uv_quant[q][i] = quants->uv_quant[q][1];
+      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+      quants->uv_round[q][i] = quants->uv_round[q][1];
       cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
 
 #if CONFIG_ALPHA
-      cpi->a_quant[q][i] = cpi->a_quant[q][1];
-      cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1];
-      cpi->a_zbin[q][i] = cpi->a_zbin[q][1];
-      cpi->a_round[q][i] = cpi->a_round[q][1];
+      quants->a_quant[q][i] = quants->a_quant[q][1];
+      quants->a_quant_shift[q][i] = quants->a_quant_shift[q][1];
+      quants->a_zbin[q][i] = quants->a_zbin[q][1];
+      quants->a_round[q][i] = quants->a_round[q][1];
       cm->a_dequant[q][i] = cm->a_dequant[q][1];
 #endif
     }
   }
 }
 
-void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
+void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  QUANTS *const quants = &cpi->quants;
+  const int segment_id = xd->mi[0]->mbmi.segment_id;
+  const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+  const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+  const int zbin = cpi->zbin_mode_boost;
   int i;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int zbin_extra;
-  int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
-  const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id,
-                                    cpi->common.base_qindex);
-
-  int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
 
   // Y
-  zbin_extra = (cpi->common.y_dequant[qindex][1] *
-                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
-
-  x->plane[0].quant = cpi->y_quant[qindex];
-  x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
-  x->plane[0].zbin = cpi->y_zbin[qindex];
-  x->plane[0].round = cpi->y_round[qindex];
-  x->plane[0].zbin_extra = (int16_t)zbin_extra;
-  x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
+  x->plane[0].quant = quants->y_quant[qindex];
+  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+  x->plane[0].zbin = quants->y_zbin[qindex];
+  x->plane[0].round = quants->y_round[qindex];
+  x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
+  xd->plane[0].dequant = cm->y_dequant[qindex];
 
   // UV
-  zbin_extra = (cpi->common.uv_dequant[qindex][1] *
-                (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
-
   for (i = 1; i < 3; i++) {
-    x->plane[i].quant = cpi->uv_quant[qindex];
-    x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
-    x->plane[i].zbin = cpi->uv_zbin[qindex];
-    x->plane[i].round = cpi->uv_round[qindex];
-    x->plane[i].zbin_extra = (int16_t)zbin_extra;
-    x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
+    x->plane[i].quant = quants->uv_quant[qindex];
+    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
+    x->plane[i].zbin = quants->uv_zbin[qindex];
+    x->plane[i].round = quants->uv_round[qindex];
+    x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
+    xd->plane[i].dequant = cm->uv_dequant[qindex];
   }
 
 #if CONFIG_ALPHA
-  x->plane[3].quant = cpi->a_quant[qindex];
-  x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
-  x->plane[3].zbin = cpi->a_zbin[qindex];
-  x->plane[3].round = cpi->a_round[qindex];
-  x->plane[3].zbin_extra = (int16_t)zbin_extra;
-  x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
+  x->plane[3].quant = quants->a_quant[qindex];
+  x->plane[3].quant_shift = quants->a_quant_shift[qindex];
+  x->plane[3].zbin = quants->a_zbin[qindex];
+  x->plane[3].round = quants->a_round[qindex];
+  x->plane[3].zbin_extra = (int16_t)((cm->a_dequant[qindex][1] * zbin) >> 7);
+  xd->plane[3].dequant = cm->a_dequant[qindex];
 #endif
 
-  x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id,
-                                        SEG_LVL_SKIP);
+  x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->q_index = qindex;
 
-  /* save this macroblock QIndex for vp9_update_zbin_extra() */
-  x->e_mbd.q_index = qindex;
+  x->errorperbit = rdmult >> 6;
+  x->errorperbit += (x->errorperbit == 0);
 
-  /* R/D setup */
-  cpi->mb.errorperbit = rdmult >> 6;
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
-  vp9_initialize_me_consts(cpi, xd->q_index);
+  vp9_initialize_me_consts(cpi, x->q_index);
 }
 
 void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  const int qindex = x->e_mbd.q_index;
+  const int qindex = x->q_index;
   const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
-                (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
+                            cpi->zbin_mode_boost) >> 7;
   const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
-                  (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
+                             cpi->zbin_mode_boost) >> 7;
 
   x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
   x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
@@ -337,26 +274,42 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  // Clear Zbin mode boost for default case
   cpi->zbin_mode_boost = 0;
-
-  // MB level quantizer setup
-  vp9_mb_init_quantizer(cpi, &cpi->mb);
+  vp9_init_plane_quantizers(cpi, &cpi->mb);
 }
 
-void vp9_set_quantizer(struct VP9_COMP *cpi, int q) {
-  VP9_COMMON *cm = &cpi->common;
-
+void vp9_set_quantizer(VP9_COMMON *cm, int q) {
+  // quantizer has to be reinitialized with vp9_init_quantizer() if any
+  // delta_q changes.
   cm->base_qindex = q;
-
-  // if any of the delta_q values are changing update flag will
-  // have to be set.
   cm->y_dc_delta_q = 0;
   cm->uv_dc_delta_q = 0;
   cm->uv_ac_delta_q = 0;
+}
+
+// Maps the external 0-63 quantizer scale onto internal qindex values:
+// 4 * quantizer up to 61 (244), then 249 and 255 for quantizers 62 and 63.
+static const int quantizer_to_qindex[] = {
+  0,    4,   8,  12,  16,  20,  24,  28,
+  32,   36,  40,  44,  48,  52,  56,  60,
+  64,   68,  72,  76,  80,  84,  88,  92,
+  96,  100, 104, 108, 112, 116, 120, 124,
+  128, 132, 136, 140, 144, 148, 152, 156,
+  160, 164, 168, 172, 176, 180, 184, 188,
+  192, 196, 200, 204, 208, 212, 216, 220,
+  224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int vp9_quantizer_to_qindex(int quantizer) {
+  return quantizer_to_qindex[quantizer];  // Unchecked: pass 0..63 only.
+}
+
+int vp9_qindex_to_quantizer(int qindex) {
+  int quantizer;
+
+  for (quantizer = 0; quantizer < 64; ++quantizer)
+    if (quantizer_to_qindex[quantizer] >= qindex)
+      return quantizer;
 
-  // quantizer has to be reinitialized if any delta_q changes.
-  // As there are not any here for now this is inactive code.
-  // if(update)
-  //    vp9_init_quantizer(cpi);
+  return 63;
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
index c078e1d41a5..1835e9cccb7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
@@ -11,21 +11,54 @@
 #ifndef VP9_ENCODER_VP9_QUANTIZE_H_
 #define VP9_ENCODER_VP9_QUANTIZE_H_
 
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // One row per qindex: [0] is DC, [1..7] are AC.
+  DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+
+#if CONFIG_ALPHA
+  DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]);
+#endif
+} QUANTS;
+
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
 struct VP9_COMP;
-
-void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
+struct VP9Common;
 
 void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
 
 void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
 
-void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
 
+void vp9_set_quantizer(struct VP9Common *cm, int q);
+
+int vp9_quantizer_to_qindex(int quantizer);
+
+int vp9_qindex_to_quantizer(int qindex);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
index 0aa3a6893eb..a04622c8cab 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -8,32 +8,86 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances:
+// (1920 * 1080 / (16 * 16)) MBs * MAX_MB_RATE bits per MB = MAXRATE_1080P.
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
-// Bits Per MB at different Q (Multiplied by 512)
-#define BPER_MB_NORMBITS    9
+#define FRAME_OVERHEAD_BITS 200
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq[QINDEX_RANGE];
+static int kf_high_motion_minq[QINDEX_RANGE];
+static int arfgf_low_motion_minq[QINDEX_RANGE];
+static int arfgf_high_motion_minq[QINDEX_RANGE];
+static int inter_minq[QINDEX_RANGE];
+static int rtc_minq[QINDEX_RANGE];
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1) {
+  int i;
+  const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq,
+                                maxq);
+
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0)
+    return 0;
 
-static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] =
-    { 1, 2, 3, 4, 5 };
+  for (i = 0; i < QINDEX_RANGE; i++)
+    if (minqtarget <= vp9_convert_qindex_to_q(i))
+      return i;
+
+  return QINDEX_RANGE - 1;
+}
+
+void vp9_rc_init_minq_luts(void) {
+  int i;
+  // Fill all six minq tables: entry i is the min qindex for max qindex i.
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    const double maxq = vp9_convert_qindex_to_q(i);
+    kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125);
+    kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50);
+    arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30);
+    arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
+    inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90);
+    rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70);
+  }
+}
 
 // These functions use formulaic calculations to make playing with the
 // quantizer tables easier. If necessary they can be replaced by lookup
@@ -43,22 +97,8 @@ double vp9_convert_qindex_to_q(int qindex) {
   return vp9_ac_quant(qindex, 0) / 4.0;
 }
 
-int vp9_gfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000828 * q * q * q) +
-               (-0.0055 * q * q) +
-               (1.32 * q) + 79.3);
-}
-
-static int kfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000973 * q * q * q) +
-               (-0.00613 * q * q) +
-               (1.316 * q) + 121.2);
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                    double correction_factor) {
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor) {
   const double q = vp9_convert_qindex_to_q(qindex);
   int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
 
@@ -67,207 +107,224 @@ int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
   return (int)(0.5 + (enumerator * correction_factor / q));
 }
 
-void vp9_save_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Stores a snapshot of key state variables which can subsequently be
-  // restored with a call to vp9_restore_coding_context. These functions are
-  // intended for use in a re-code loop in vp9_compress_frame where the
-  // quantizer value is adjusted between loop iterations.
-  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
-
-  vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
-
-  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
-
-  vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
-  vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
-
-  cc->fc = cm->fc;
-}
-
-void vp9_restore_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Restore key state variables to the snapshot state stored in the
-  // previous call to vp9_save_coding_context.
-  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
-
-  vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
-
-  vpx_memcpy(cm->last_frame_seg_map,
-             cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mi_rows * cm->mi_cols));
-
-  vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
-  vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
-
-  cm->fc = cc->fc;
+static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+                              double correction_factor) {
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor));
+  return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS;
 }
 
-void vp9_setup_key_frame(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  vp9_setup_past_independence(cm);
-
-  // interval before next GF
-  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-  /* All buffers are implicitly updated on key frames. */
-  cpi->refresh_golden_frame = 1;
-  cpi->refresh_alt_ref_frame = 1;
+int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const int min_frame_target = MAX(rc->min_frame_bandwidth,
+                                   rc->avg_frame_bandwidth >> 5);
+  if (target < min_frame_target)
+    target = min_frame_target;
+  if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+    // If there is an active ARF at this location use the minimum
+    // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+    // number of bits will be spent if needed for constructed ARFs.
+    target = min_frame_target;
+  }
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth)
+    target = rc->max_frame_bandwidth;
+  return target;
 }
 
-void vp9_setup_inter_frame(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  if (cm->error_resilient_mode || cm->intra_only)
-    vp9_setup_past_independence(cm);
-
-  assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
-  cm->fc = cm->frame_contexts[cm->frame_context_idx];
+int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  if (oxcf->rc_max_intra_bitrate_pct) {
+    const int max_rate = rc->avg_frame_bandwidth *
+                             oxcf->rc_max_intra_bitrate_pct / 100;
+    target = MIN(target, max_rate);
+  }
+  if (target > rc->max_frame_bandwidth)
+    target = rc->max_frame_bandwidth;
+  return target;
 }
 
-static int estimate_bits_at_q(int frame_kind, int q, int mbs,
-                              double correction_factor) {
-  const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));
 
-  // Attempt to retain reasonable accuracy without overflow. The cutoff is
-  // chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-  // largest Bpm takes 20 bits.
-  return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs
-                           : (bpm * mbs) >> BPER_MB_NORMBITS;
+// Update the buffer level for higher layers, given the encoded current layer.
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
+  int temporal_layer = 0;
+  int current_temporal_layer = svc->temporal_layer_id;
+  for (temporal_layer = current_temporal_layer + 1;
+      temporal_layer < svc->number_temporal_layers; ++temporal_layer) {
+    LAYER_CONTEXT *lc = &svc->layer_context[temporal_layer];
+    RATE_CONTROL *lrc = &lc->rc;
+    int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
+        encoded_frame_size);
+    lrc->bits_off_target += bits_off_for_this_layer;
+
+    // Clip buffer level to maximum buffer size for the layer.
+    lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
+    lrc->buffer_level = lrc->bits_off_target;
+  }
 }
 
+// Update the buffer level: leaky bucket model.
+static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
 
-static void calc_iframe_target_size(VP9_COMP *cpi) {
-  // boost defaults to half second
-  int target;
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame) {
+    rc->bits_off_target -= encoded_frame_size;
+  } else {
+    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+  }
 
-  // New Two pass RC
-  target = cpi->per_frame_bandwidth;
+  // Clip the buffer level to the maximum specified buffer size.
+  rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size);
+  rc->buffer_level = rc->bits_off_target;
 
-  if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    int max_rate = cpi->per_frame_bandwidth
-                 * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+  if (cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR) {
+    update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+  }
+}
 
-    if (target > max_rate)
-      target = max_rate;
+void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  if (pass == 0 && oxcf->rc_mode == RC_MODE_CBR) {
+    rc->avg_frame_qindex[0] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[1] = oxcf->worst_allowed_q;
+    rc->avg_frame_qindex[2] = oxcf->worst_allowed_q;
+  } else {
+    rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
+    rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q +
+                                   oxcf->best_allowed_q) / 2;
   }
 
-  cpi->this_frame_target = target;
-}
+  rc->last_q[0] = oxcf->best_allowed_q;
+  rc->last_q[1] = oxcf->best_allowed_q;
+  rc->last_q[2] = oxcf->best_allowed_q;
 
+  rc->buffer_level =    oxcf->starting_buffer_level;
+  rc->bits_off_target = oxcf->starting_buffer_level;
 
-//  Do the best we can to define the parameters for the next GF based
-//  on what information we have available.
-//
-//  In this experimental code only two pass is supported
-//  so we just use the interval determined in the two pass code.
-static void calc_gf_params(VP9_COMP *cpi) {
-  // Set the gf interval
-  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-}
+  rc->rolling_target_bits      = rc->avg_frame_bandwidth;
+  rc->rolling_actual_bits      = rc->avg_frame_bandwidth;
+  rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+  rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
 
+  rc->total_actual_bits = 0;
+  rc->total_target_vs_actual = 0;
 
-static void calc_pframe_target_size(VP9_COMP *cpi) {
-  const int min_frame_target = MAX(cpi->min_frame_bandwidth,
-                                   cpi->av_per_frame_bandwidth >> 5);
-  if (cpi->refresh_alt_ref_frame) {
-    // Special alt reference frame case
-    // Per frame bit target for the alt ref frame
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    cpi->this_frame_target = cpi->per_frame_bandwidth;
-  } else {
-    // Normal frames (gf,and inter)
-    cpi->this_frame_target = cpi->per_frame_bandwidth;
-  }
+  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+  rc->frames_since_key = 8;  // Sensible default for first frame.
+  rc->this_key_frame_forced = 0;
+  rc->next_key_frame_forced = 0;
+  rc->source_alt_ref_pending = 0;
+  rc->source_alt_ref_active = 0;
 
-  // Check that the total sum of adjustments is not above the maximum allowed.
-  // That is, having allowed for the KF and GF penalties, we have not pushed
-  // the current inter-frame target too low. If the adjustment we apply here is
-  // not capable of recovering all the extra bits we have spent in the KF or GF,
-  // then the remainder will have to be recovered over a longer time span via
-  // other buffer / rate control mechanisms.
-  if (cpi->this_frame_target < min_frame_target)
-    cpi->this_frame_target = min_frame_target;
+  rc->frames_till_gf_update_due = 0;
 
-  if (!cpi->refresh_alt_ref_frame)
-    // Note the baseline target data rate for this inter frame.
-    cpi->inter_frame_target = cpi->this_frame_target;
+  rc->ni_av_qi = oxcf->worst_allowed_q;
+  rc->ni_tot_qi = 0;
+  rc->ni_frames = 0;
 
-  // Adjust target frame size for Golden Frames:
-  if (cpi->frames_till_gf_update_due == 0) {
-    const int q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]
-                                          : cpi->oxcf.fixed_q;
+  rc->tot_q = 0.0;
+  rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q);
 
-    cpi->refresh_golden_frame = 1;
+  rc->rate_correction_factor = 1.0;
+  rc->key_frame_rate_correction_factor = 1.0;
+  rc->gf_rate_correction_factor = 1.0;
+}
 
-    calc_gf_params(cpi);
+int vp9_rc_drop_frame(VP9_COMP *cpi) {
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
 
-    // If we are using alternate ref instead of gf then do not apply the boost
-    // It will instead be applied to the altref update
-    // Jims modified boost
-    if (!cpi->source_alt_ref_active) {
-      if (cpi->oxcf.fixed_q < 0) {
-        // The spend on the GF is defined in the two pass code
-        // for two pass encodes
-        cpi->this_frame_target = cpi->per_frame_bandwidth;
+  if (!oxcf->drop_frames_water_mark) {
+    return 0;
+  } else {
+    if (rc->buffer_level < 0) {
+      // Always drop if buffer is below 0.
+      return 1;
+    } else {
+      // If buffer is below drop_mark, for now just drop every other frame
+      // (starting with the next frame) until it increases back over drop_mark.
+      int drop_mark = (int)(oxcf->drop_frames_water_mark *
+          oxcf->optimal_buffer_level / 100);
+      if ((rc->buffer_level > drop_mark) &&
+          (rc->decimation_factor > 0)) {
+        --rc->decimation_factor;
+      } else if (rc->buffer_level <= drop_mark &&
+          rc->decimation_factor == 0) {
+        rc->decimation_factor = 1;
+      }
+      if (rc->decimation_factor > 0) {
+        if (rc->decimation_count > 0) {
+          --rc->decimation_count;
+          return 1;
+        } else {
+          rc->decimation_count = rc->decimation_factor;
+          return 0;
+        }
       } else {
-        cpi->this_frame_target =
-          (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0)
-           * cpi->last_boost) / 100;
+        rc->decimation_count = 0;
+        return 0;
       }
-    } else {
-      // If there is an active ARF at this location use the minimum
-      // bits on this frame even if it is a constructed arf.
-      // The active maximum quantizer insures that an appropriate
-      // number of bits will be spent if needed for constructed ARFs.
-      cpi->this_frame_target = 0;
     }
   }
 }
 
+// Selects the correction factor matching the current frame: key frame,
+// golden/alt-ref refresh (not an ARF overlay, not SVC in CBR), or other.
+static double get_rate_correction_factor(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  if (cpi->common.frame_type == KEY_FRAME)
+    return rc->key_frame_rate_correction_factor;
+  if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+      !rc->is_src_frame_alt_ref &&
+      !(cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR))
+    return rc->gf_rate_correction_factor;
+  return rc->rate_correction_factor;
+}
 
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
-  const int q = cpi->common.base_qindex;
+// Writes |factor| to the key-frame, golden/alt-ref, or default slot.
+static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  if (cpi->common.frame_type == KEY_FRAME)
+    rc->key_frame_rate_correction_factor = factor;
+  else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+           !rc->is_src_frame_alt_ref &&
+           !(cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR))
+    rc->gf_rate_correction_factor = factor;
+  else
+    rc->rate_correction_factor = factor;
+}
+
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+  const VP9_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
-  double rate_correction_factor;
+  double rate_correction_factor = get_rate_correction_factor(cpi);
   double adjustment_limit;
 
   int projected_size_based_on_q = 0;
 
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
+  // Do not update the rate factors for arf overlay frames.
+  if (cpi->rc.is_src_frame_alt_ref)
+    return;
 
-  if (cpi->common.frame_type == KEY_FRAME) {
-    rate_correction_factor = cpi->key_frame_rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      rate_correction_factor = cpi->gf_rate_correction_factor;
-    else
-      rate_correction_factor = cpi->rate_correction_factor;
-  }
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();
 
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
   // Stay in double to avoid int overflow when values are large
-  projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q,
-                                                 cpi->common.MBs,
+  projected_size_based_on_q = estimate_bits_at_q(cm->frame_type,
+                                                 cm->base_qindex, cm->MBs,
                                                  rate_correction_factor);
-
   // Work out a size correction factor.
   if (projected_size_based_on_q > 0)
-    correction_factor =
-        (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+    correction_factor = (100 * cpi->rc.projected_frame_size) /
+                            projected_size_based_on_q;
 
   // More heavily damped adjustment used if we have been oscillating either side
   // of target.
@@ -284,74 +341,48 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
       break;
   }
 
-  // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
   if (correction_factor > 102) {
     // We are not already at the worst allowable quality
-    correction_factor =
-        (int)(100 + ((correction_factor - 100) * adjustment_limit));
-    rate_correction_factor =
-        ((rate_correction_factor * correction_factor) / 100);
+    correction_factor = (int)(100 + ((correction_factor - 100) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
 
     // Keep rate_correction_factor within limits
     if (rate_correction_factor > MAX_BPB_FACTOR)
       rate_correction_factor = MAX_BPB_FACTOR;
   } else if (correction_factor < 99) {
     // We are not already at the best allowable quality
-    correction_factor =
-        (int)(100 - ((100 - correction_factor) * adjustment_limit));
-    rate_correction_factor =
-        ((rate_correction_factor * correction_factor) / 100);
+    correction_factor = (int)(100 - ((100 - correction_factor) *
+                                  adjustment_limit));
+    rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
 
     // Keep rate_correction_factor within limits
     if (rate_correction_factor < MIN_BPB_FACTOR)
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  if (cpi->common.frame_type == KEY_FRAME) {
-    cpi->key_frame_rate_correction_factor = rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      cpi->gf_rate_correction_factor = rate_correction_factor;
-    else
-      cpi->rate_correction_factor = rate_correction_factor;
-  }
+  set_rate_correction_factor(cpi, rate_correction_factor);
 }
 
 
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
-  int q = cpi->active_worst_quality;
-
-  int i;
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality) {
+  const VP9_COMMON *const cm = &cpi->common;
+  int q = active_worst_quality;
   int last_error = INT_MAX;
-  int target_bits_per_mb;
-  int bits_per_mb_at_this_q;
-  double correction_factor;
-
-  // Select the appropriate correction factor based upon type of frame.
-  if (cpi->common.frame_type == KEY_FRAME) {
-    correction_factor = cpi->key_frame_rate_correction_factor;
-  } else {
-    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
-      correction_factor = cpi->gf_rate_correction_factor;
-    else
-      correction_factor = cpi->rate_correction_factor;
-  }
+  int i, target_bits_per_mb;
+  const double correction_factor = get_rate_correction_factor(cpi);
 
   // Calculate required scaling factor based on target frame size and size of
   // frame produced using previous Q.
-  if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
-    target_bits_per_mb =
-        (target_bits_per_frame / cpi->common.MBs)
-        << BPER_MB_NORMBITS;  // Case where we would overflow int
-  else
-    target_bits_per_mb =
-        (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+  target_bits_per_mb =
+      ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
 
-  i = cpi->active_best_quality;
+  i = active_best_quality;
 
   do {
-    bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,
-                                                 correction_factor);
+    const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
+                                                             correction_factor);
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -363,114 +394,1026 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
     } else {
       last_error = bits_per_mb_at_this_q - target_bits_per_mb;
     }
-  } while (++i <= cpi->active_worst_quality);
+  } while (++i <= active_worst_quality);
 
   return q;
 }
 
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+                              int *low_motion_minq, int *high_motion_minq) {
+  if (gfu_boost > high) {
+    return low_motion_minq[q];
+  } else if (gfu_boost < low) {
+    return high_motion_minq[q];
+  } else {
+    const int gap = high - low;
+    const int offset = high - gfu_boost;
+    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+    return low_motion_minq[q] + adjustment;
+  }
+}
 
-static int estimate_keyframe_frequency(VP9_COMP *cpi) {
-  int i;
+static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const unsigned int curr_frame = cpi->common.current_video_frame;
+  int active_worst_quality;
 
-  // Average key frame frequency
-  int av_key_frame_frequency = 0;
+  if (cpi->common.frame_type == KEY_FRAME) {
+    active_worst_quality = curr_frame == 0 ? rc->worst_quality
+                                           : rc->last_q[KEY_FRAME] * 2;
+  } else {
+    if (!rc->is_src_frame_alt_ref &&
+        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+      active_worst_quality =  curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
+                                              : rc->last_q[INTER_FRAME];
+    } else {
+      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
+                                             : rc->last_q[INTER_FRAME] * 2;
+    }
+  }
 
-  /* First key frame at start of sequence is a special case. We have no
-   * frequency data.
-   */
-  if (cpi->key_frame_count == 1) {
-    /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
-     * whichever is smaller.
-     */
-    int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
-    av_key_frame_frequency = (int)cpi->output_framerate * 2;
+  return MIN(active_worst_quality, rc->worst_quality);
+}
 
-    if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
-      av_key_frame_frequency = cpi->oxcf.key_freq;
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
+  // Adjust active_worst_quality: If buffer is above the optimal/target level,
+  // bring active_worst_quality down depending on fullness of buffer.
+  // If buffer is below the optimal level, let the active_worst_quality go from
+  // ambient Q (at buffer = optimal level) to worst_quality level
+  // (at buffer = critical level).
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  // Buffer level below which we push active_worst to worst_quality.
+  int64_t critical_level = oxcf->optimal_buffer_level >> 2;
+  int64_t buff_lvl_step = 0;
+  int adjustment = 0;
+  int active_worst_quality;
+  if (cm->frame_type == KEY_FRAME)
+    return rc->worst_quality;
+  if (cm->current_video_frame > 1)
+    active_worst_quality = MIN(rc->worst_quality,
+                               rc->avg_frame_qindex[INTER_FRAME] * 5 / 4);
+  else
+    active_worst_quality = MIN(rc->worst_quality,
+                               rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
+  if (rc->buffer_level > oxcf->optimal_buffer_level) {
+    // Adjust down.
+    // Maximum limit for down adjustment, ~30%.
+    int max_adjustment_down = active_worst_quality / 3;
+    if (max_adjustment_down) {
+      buff_lvl_step = ((oxcf->maximum_buffer_size -
+                        oxcf->optimal_buffer_level) / max_adjustment_down);
+      if (buff_lvl_step)
+        adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+                            buff_lvl_step);
+      active_worst_quality -= adjustment;
+    }
+  } else if (rc->buffer_level > critical_level) {
+    // Adjust up from ambient Q.
+    if (critical_level) {
+      buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
+      if (buff_lvl_step) {
+        adjustment =
+            (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
+                  (oxcf->optimal_buffer_level - rc->buffer_level) /
+                  buff_lvl_step);
+      }
+      active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
+    }
+  } else {
+    // Set to worst_quality if buffer is below critical level.
+    active_worst_quality = rc->worst_quality;
+  }
+  return active_worst_quality;
+}
+
+static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
+                                             int *bottom_index,
+                                             int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int active_best_quality;
+  int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+  int q;
+
+  if (frame_is_intra_only(cm)) {
+    active_best_quality = rc->best_quality;
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            (last_boosted_q * 0.75));
+      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+    } else if (cm->current_video_frame > 0) {
+      // not first frame of one pass and kf_boost is set
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME],
+                                               rc->kf_boost,
+                                               kf_low, kf_high,
+                                               kf_low_motion_minq,
+                                               kf_high_motion_minq);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
 
-    cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
-      = av_key_frame_frequency;
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             !cpi->use_svc &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = active_worst_quality;
+    }
+    active_best_quality = get_active_quality(
+        q, rc->gfu_boost, gf_low, gf_high,
+        arfgf_low_motion_minq, arfgf_high_motion_minq);
   } else {
-    unsigned int total_weight = 0;
-    int last_kf_interval =
-      (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
-
-    /* reset keyframe context and calculate weighted average of last
-     * KEY_FRAME_CONTEXT keyframes
-     */
-    for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
-      if (i < KEY_FRAME_CONTEXT - 1)
-        cpi->prior_key_frame_distance[i]
-          = cpi->prior_key_frame_distance[i + 1];
+    // Use the lower of active_worst_quality and recent/average Q.
+    if (cm->current_video_frame > 1) {
+      if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+        active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
       else
-        cpi->prior_key_frame_distance[i] = last_kf_interval;
-
-      av_key_frame_frequency += prior_key_frame_weight[i]
-                                * cpi->prior_key_frame_distance[i];
-      total_weight += prior_key_frame_weight[i];
+        active_best_quality = rtc_minq[active_worst_quality];
+    } else {
+      if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+        active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      else
+        active_best_quality = rtc_minq[active_worst_quality];
     }
+  }
 
-    av_key_frame_frequency /= total_weight;
+  // Clip the active best and worst quality values to limits
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  // Limit Q range for the adaptive loop.
+  if (cm->frame_type == KEY_FRAME &&
+      !rc->this_key_frame_forced  &&
+      !(cm->current_video_frame == 0)) {
+    int qdelta = 0;
+    vp9_clear_system_state();
+    qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                        active_worst_quality, 2.0);
+    *top_index = active_worst_quality + qdelta;
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
   }
-  return av_key_frame_frequency;
+#endif
+
+  // Special case code to try and match quality with forced key frames
+  if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
 }
 
+static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
+                                             int *bottom_index,
+                                             int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level = oxcf->cq_level;
+  int active_best_quality;
+  int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+  int q;
+
+  if (frame_is_intra_only(cm)) {
+    active_best_quality = rc->best_quality;
+#if !CONFIG_MULTIPLE_ARF
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            last_boosted_q * 0.75);
+      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+    } else if (cm->current_video_frame > 0) {
+      // not first frame of one pass and kf_boost is set
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      active_best_quality = get_active_quality(rc->avg_frame_qindex[KEY_FRAME],
+                                               rc->kf_boost,
+                                               kf_low, kf_high,
+                                               kf_low_motion_minq,
+                                               kf_high_motion_minq);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
 
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor);
+    }
+#else
+    double current_q;
+    // Force the KF quantizer to be 30% of the active_worst_quality.
+    current_q = vp9_convert_qindex_to_q(active_worst_quality);
+    active_best_quality = active_worst_quality
+        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
+#endif
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = rc->avg_frame_qindex[KEY_FRAME];
+    }
+    // For constrained quality don't allow Q less than the cq level
+    if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY) {
+      if (q < cq_level)
+        q = cq_level;
+
+      active_best_quality = get_active_quality(q, rc->gfu_boost,
+                                               gf_low, gf_high,
+                                               arfgf_low_motion_minq,
+                                               arfgf_high_motion_minq);
+
+      // Constrained quality use slightly lower active best.
+      active_best_quality = active_best_quality * 15 / 16;
+
+    } else if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
+      if (!cpi->refresh_alt_ref_frame) {
+        active_best_quality = cq_level;
+      } else {
+        active_best_quality = get_active_quality(
+            q, rc->gfu_boost, gf_low, gf_high,
+            arfgf_low_motion_minq, arfgf_high_motion_minq);
+      }
+    } else {
+      active_best_quality = get_active_quality(
+          q, rc->gfu_boost, gf_low, gf_high,
+          arfgf_low_motion_minq, arfgf_high_motion_minq);
+    }
+  } else {
+    if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
+      active_best_quality = cq_level;
+    } else {
+      // Use the lower of active_worst_quality and recent/average Q.
+      if (cm->current_video_frame > 1)
+        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
+      else
+        active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY) &&
+          (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Clip the active best and worst quality values to limits
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  {
+    int qdelta = 0;
+    vp9_clear_system_state();
+
+    // Limit Q range for the adaptive loop.
+    if (cm->frame_type == KEY_FRAME &&
+        !rc->this_key_frame_forced &&
+        !(cm->current_video_frame == 0)) {
+      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 2.0);
+    } else if (!rc->is_src_frame_alt_ref &&
+               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 1.75);
+    }
+    *top_index = active_worst_quality + qdelta;
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+  }
+#endif
 
-  cpi->frames_since_key = 0;
-  cpi->key_frame_count++;
+  if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
+    q = active_best_quality;
+  // Special case code to try and match quality with forced key frames
+  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+#if CONFIG_MULTIPLE_ARF
+  // Force the quantizer determined by the coding order pattern.
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+      cpi->oxcf.rc_mode != RC_MODE_CONSTANT_QUALITY) {
+    double new_q;
+    double current_q = vp9_convert_qindex_to_q(active_worst_quality);
+    int level = cpi->this_frame_weight;
+    assert(level >= 0);
+    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
+    q = active_worst_quality +
+        vp9_compute_qdelta(rc, current_q, new_q);
+
+    *bottom_index = q;
+    *top_index    = q;
+    printf("frame:%d q:%d\n", cm->current_video_frame, q);
+  }
+#endif
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
 }
 
+static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
+                                         int *bottom_index,
+                                         int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level = oxcf->cq_level;
+  int active_best_quality;
+  int active_worst_quality = cpi->twopass.active_worst_quality;
+  int q;
+
+  if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
+#if !CONFIG_MULTIPLE_ARF
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      int qindex = rc->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+      int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                            last_boosted_q * 0.75);
+      active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
+    } else {
+      // Not forced keyframe.
+      double q_adj_factor = 1.0;
+      double q_val;
+      // Baseline value derived from active_worst_quality and kf boost.
+      active_best_quality = get_active_quality(active_worst_quality,
+                                               rc->kf_boost,
+                                               kf_low, kf_high,
+                                               kf_low_motion_minq,
+                                               kf_high_motion_minq);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
 
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit) {
-  // Set-up bounds on acceptable frame size:
-  if (cpi->oxcf.fixed_q >= 0) {
-    // Fixed Q scenario: frame size never outranges target (there is no target!)
-    *frame_under_shoot_limit = 0;
-    *frame_over_shoot_limit  = INT_MAX;
-  } else {
-    if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+      // Make a further adjustment based on the kf zero motion measure.
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(active_best_quality);
+      active_best_quality += vp9_compute_qdelta(rc, q_val,
+                                                q_val * q_adj_factor);
+    }
+#else
+    double current_q;
+    // Force the KF quantizer to be 30% of the active_worst_quality.
+    current_q = vp9_convert_qindex_to_q(active_worst_quality);
+    active_best_quality = active_worst_quality
+        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
+#endif
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
     } else {
-      if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
-        *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+      q = active_worst_quality;
+    }
+    // For constrained quality don't allow Q less than the cq level
+    if (oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY) {
+      if (q < cq_level)
+        q = cq_level;
+
+      active_best_quality = get_active_quality(q, rc->gfu_boost,
+                                               gf_low, gf_high,
+                                               arfgf_low_motion_minq,
+                                               arfgf_high_motion_minq);
+
+      // Constrained quality use slightly lower active best.
+      active_best_quality = active_best_quality * 15 / 16;
+
+    } else if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
+      if (!cpi->refresh_alt_ref_frame) {
+        active_best_quality = cq_level;
       } else {
-        // Stron overshoot limit for constrained quality
-        if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
-        } else {
-          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
-        }
+        active_best_quality = get_active_quality(
+            q, rc->gfu_boost, gf_low, gf_high,
+            arfgf_low_motion_minq, arfgf_high_motion_minq);
       }
+    } else {
+      active_best_quality = get_active_quality(
+          q, rc->gfu_boost, gf_low, gf_high,
+          arfgf_low_motion_minq, arfgf_high_motion_minq);
     }
+  } else {
+    if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
+      active_best_quality = cq_level;
+    } else {
+      active_best_quality = inter_minq[active_worst_quality];
 
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == RC_MODE_CONSTRAINED_QUALITY) &&
+          (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Clip the active best and worst quality values to limits.
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  {
+    int qdelta = 0;
+    vp9_clear_system_state();
+
+    // Limit Q range for the adaptive loop.
+    if ((cm->frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi)) &&
+        !rc->this_key_frame_forced) {
+      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 2.0);
+    } else if (!rc->is_src_frame_alt_ref &&
+               (oxcf->rc_mode != RC_MODE_CBR) &&
+               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                          active_worst_quality, 1.75);
+    }
+    *top_index = active_worst_quality + qdelta;
+    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+  }
+#endif
+
+  if (oxcf->rc_mode == RC_MODE_CONSTANT_QUALITY) {
+    q = active_best_quality;
+  // Special case code to try and match quality with forced key frames.
+  } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+    q = rc->last_boosted_qindex;
+  } else {
+    q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                          active_best_quality, active_worst_quality);
+    if (q > *top_index) {
+      // Special case when we are targeting the max allowed rate.
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        *top_index = q;
+      else
+        q = *top_index;
+    }
+  }
+#if CONFIG_MULTIPLE_ARF
+  // Force the quantizer determined by the coding order pattern.
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+      cpi->oxcf.rc_mode != RC_MODE_CONSTANT_QUALITY) {
+    double new_q;
+    double current_q = vp9_convert_qindex_to_q(active_worst_quality);
+    int level = cpi->this_frame_weight;
+    assert(level >= 0);
+    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
+    q = active_worst_quality +
+        vp9_compute_qdelta(rc, current_q, new_q);
+
+    *bottom_index = q;
+    *top_index    = q;
+    printf("frame:%d q:%d\n", cm->current_video_frame, q);
+  }
+#endif
+  assert(*top_index <= rc->worst_quality &&
+         *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi,
+                             int *bottom_index, int *top_index) {
+  int q;
+  if (cpi->pass == 0) {
+    if (cpi->oxcf.rc_mode == RC_MODE_CBR)
+      q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
+    else
+      q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+  } else {
+    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+  }
+
+  if (cpi->sf.use_nonrd_pick_mode) {
+    if (cpi->sf.force_frame_boost == 1)
+      q -= cpi->sf.max_delta_qindex;
+
+    if (q < *bottom_index)
+      *bottom_index = q;
+    else if (q > *top_index)
+      *top_index = q;
+  }
+  return q;
+}
+
+void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
+                                      int frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
+  if (cpi->oxcf.rc_mode == RC_MODE_CONSTANT_QUALITY) {
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit  = INT_MAX;
+  } else {
     // For very small rate targets where the fractional adjustment
-    // (eg * 7/8) may be tiny make sure there is at least a minimum
-    // range.
-    *frame_over_shoot_limit += 200;
-    *frame_under_shoot_limit -= 200;
-    if (*frame_under_shoot_limit < 0)
-      *frame_under_shoot_limit = 0;
+    // may be tiny make sure there is at least a minimum range.
+    const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
+    *frame_under_shoot_limit = MAX(frame_target - tolerance - 200, 0);
+    *frame_over_shoot_limit = MIN(frame_target + tolerance + 200,
+                                  cpi->rc.max_frame_bandwidth);
   }
 }
 
+void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  rc->this_frame_target = target;
 
-// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
+  // Target rate per SB64 (including partial SB64s).
+  rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /
+                             (cm->width * cm->height);
+}
+
+static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
+  // this frame refreshes means next frames don't unless specified by user
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->frames_since_golden = 0;
+
+#if CONFIG_MULTIPLE_ARF
+  if (!cpi->multi_arf_enabled)
+#endif
+    // Clear the alternate reference update pending flag.
+    rc->source_alt_ref_pending = 0;
+
+  // Set the alternate reference frame active flag
+  rc->source_alt_ref_active = 1;
+}
+
+static void update_golden_frame_stats(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // Update the Golden frame usage counts.
+  if (cpi->refresh_golden_frame) {
+    // this frame refreshes means next frames don't unless specified by user
+    rc->frames_since_golden = 0;
+
+    if (!rc->source_alt_ref_pending)
+      rc->source_alt_ref_active = 0;
+
+    // Decrement count down till next gf
+    if (rc->frames_till_gf_update_due > 0)
+      rc->frames_till_gf_update_due--;
+
+  } else if (!cpi->refresh_alt_ref_frame) {
+    // Decrement count down till next gf
+    if (rc->frames_till_gf_update_due > 0)
+      rc->frames_till_gf_update_due--;
+
+    rc->frames_since_golden++;
+  }
+}
+
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int qindex = cm->base_qindex;
+
+  // Update rate control heuristics
+  rc->projected_frame_size = (int)(bytes_used << 3);
+
+  // Post encode loop adjustment of Q prediction.
+  vp9_rc_update_rate_correction_factors(
+      cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF ||
+            oxcf->rc_mode == RC_MODE_CBR) ? 2 : 0);
+
+  // Keep a record of last Q and ambient average Q.
+  if (cm->frame_type == KEY_FRAME) {
+    rc->last_q[KEY_FRAME] = qindex;
+    rc->avg_frame_qindex[KEY_FRAME] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) &&
+             !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
+    rc->last_q[2] = qindex;
+    rc->avg_frame_qindex[2] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2);
+  } else {
+    rc->last_q[INTER_FRAME] = qindex;
+    rc->avg_frame_qindex[INTER_FRAME] =
+        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+    rc->ni_frames++;
+    rc->tot_q += vp9_convert_qindex_to_q(qindex);
+    rc->avg_q = rc->tot_q / rc->ni_frames;
+    // Calculate the average Q for normal inter frames (not key or GFU frames).
+    rc->ni_tot_qi += qindex;
+    rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+  }
+
+  // Keep record of last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping
+  if ((qindex < rc->last_boosted_qindex) ||
+      ((cpi->static_mb_pct < 100) &&
+       ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+    rc->last_boosted_qindex = qindex;
+  }
+
+  update_buffer_level(cpi, rc->projected_frame_size);
+
+  // Rolling monitors of whether we are over or underspending used to help
+  // regulate min and Max Q in two pass.
+  if (cm->frame_type != KEY_FRAME) {
+    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+  }
+
+  // Actual bits spent
+  rc->total_actual_bits += rc->projected_frame_size;
+  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+  rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+  if (oxcf->play_alternate && cpi->refresh_alt_ref_frame &&
+      (cm->frame_type != KEY_FRAME))
+    // Update the alternate reference frame stats as appropriate.
+    update_alt_ref_frame_stats(cpi);
+  else
+    // Update the Golden frame stats as appropriate.
+    update_golden_frame_stats(cpi);
 
   if (cm->frame_type == KEY_FRAME)
-    calc_iframe_target_size(cpi);
+    rc->frames_since_key = 0;
+  if (cm->show_frame) {
+    rc->frames_since_key++;
+    rc->frames_to_key--;
+  }
+}
+
+void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
+  // Update buffer level with zero size, update frame counters, and return.
+  update_buffer_level(cpi, 0);
+  cpi->common.last_frame_type = cpi->common.frame_type;
+  cpi->rc.frames_since_key++;
+  cpi->rc.frames_to_key--;
+}
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS   1
+
+static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+  static const int af_ratio = 10;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int target;
+#if USE_ALTREF_FOR_ONE_PASS
+  target = (!rc->is_src_frame_alt_ref &&
+            (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ?
+      (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+      (rc->baseline_gf_interval + af_ratio - 1) :
+      (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+      (rc->baseline_gf_interval + af_ratio - 1);
+#else
+  target = rc->avg_frame_bandwidth;
+#endif
+  return vp9_rc_clamp_pframe_target_size(cpi, target);
+}
+
+static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+  static const int kf_ratio = 25;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const int target = rc->avg_frame_bandwidth * kf_ratio;
+  return vp9_rc_clamp_iframe_target_size(cpi, target);
+}
+
+void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;
+  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 ||
+       (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0 ||
+       (cpi->oxcf.auto_key && 0))) {
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced = cm->current_video_frame != 0 &&
+                                rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  if (rc->frames_till_gf_update_due == 0) {
+    rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+  }
+  if (cm->frame_type == KEY_FRAME)
+    target = calc_iframe_target_size_one_pass_vbr(cpi);
   else
-    calc_pframe_target_size(cpi);
+    target = calc_pframe_target_size_one_pass_vbr(cpi);
+  vp9_rc_set_frame_target(cpi, target);
+}
+
+static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const SVC *const svc = &cpi->svc;
+  const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
+  int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int target = rc->avg_frame_bandwidth;
+  if (svc->number_temporal_layers > 1 &&
+      oxcf->rc_mode == RC_MODE_CBR) {
+    // Note that for layers, avg_frame_bandwidth is the cumulative
+    // per-frame-bandwidth. For the target size of this frame, use the
+    // layer average frame size (i.e., non-cumulative per-frame-bw).
+    int current_temporal_layer = svc->temporal_layer_id;
+    const LAYER_CONTEXT *lc = &svc->layer_context[current_temporal_layer];
+    target = lc->avg_frame_size;
+    min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+  }
+  if (diff > 0) {
+    // Lower the target bandwidth for this frame.
+    const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Increase the target bandwidth for this frame.
+    const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  return MAX(min_frame_target, target);
+}
+
+static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  const SVC *const svc = &cpi->svc;
+  int target;
+  if (cpi->common.current_video_frame == 0) {
+    target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX)
+      ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2);
+  } else {
+    int kf_boost = 32;
+    double framerate = oxcf->framerate;
+    if (svc->number_temporal_layers > 1 &&
+        oxcf->rc_mode == RC_MODE_CBR) {
+      // Use the layer framerate for temporal layers CBR mode.
+      const LAYER_CONTEXT *lc = &svc->layer_context[svc->temporal_layer_id];
+      framerate = lc->framerate;
+    }
+    kf_boost = MAX(kf_boost, (int)(2 * framerate - 16));
+    if (rc->frames_since_key <  framerate / 2) {
+      kf_boost = (int)(kf_boost * rc->frames_since_key /
+                       (framerate / 2));
+    }
+    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  return vp9_rc_clamp_iframe_target_size(cpi, target);
+}
+
+void vp9_rc_get_svc_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target = rc->avg_frame_bandwidth;
+  if ((cm->current_video_frame == 0) ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && (rc->frames_since_key %
+          cpi->oxcf.key_freq == 0))) {
+    cm->frame_type = KEY_FRAME;
+    rc->source_alt_ref_active = 0;
+
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+      cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
+    }
+
+    if (cpi->pass == 0 && cpi->oxcf.rc_mode == RC_MODE_CBR) {
+      target = calc_iframe_target_size_one_pass_cbr(cpi);
+    }
+  } else {
+    cm->frame_type = INTER_FRAME;
+
+    if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
+      LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+      if (cpi->svc.spatial_layer_id == 0) {
+        lc->is_key_frame = 0;
+      } else {
+        lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+      }
+    }
+
+    if (cpi->pass == 0 && cpi->oxcf.rc_mode == RC_MODE_CBR) {
+      target = calc_pframe_target_size_one_pass_cbr(cpi);
+    }
+  }
+  vp9_rc_set_frame_target(cpi, target);
+  rc->frames_till_gf_update_due = INT_MAX;
+  rc->baseline_gf_interval = INT_MAX;
+}
+
+void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;
+  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+  if ((cm->current_video_frame == 0 ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      rc->frames_to_key == 0 ||
+      (cpi->oxcf.auto_key && 0))) {
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced = cm->current_video_frame != 0 &&
+                                rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  } else {
+    cm->frame_type = INTER_FRAME;
+    target = calc_pframe_target_size_one_pass_cbr(cpi);
+  }
+  vp9_rc_set_frame_target(cpi, target);
+  // Don't use gf_update by default in CBR mode.
+  rc->frames_till_gf_update_due = INT_MAX;
+  rc->baseline_gf_interval = INT_MAX;
+}
+
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) {
+  int start_index = rc->worst_quality;
+  int target_index = rc->worst_quality;
+  int i;
+
+  // Convert the average q value to an index.
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+    start_index = i;
+    if (vp9_convert_qindex_to_q(i) >= qstart)
+      break;
+  }
+
+  // Convert the q target to an index
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+    target_index = i;
+    if (vp9_convert_qindex_to_q(i) >= qtarget)
+      break;
+  }
+
+  return target_index - start_index;
+}
+
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio) {
+  int target_index = rc->worst_quality;
+  int i;
+
+  // Look up the current projected bits per block for the base index
+  const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0);
+
+  // Find the target bits per mb based on the base value and given ratio.
+  const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+  // Convert the q target to an index
+  for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+    target_index = i;
+    if (vp9_rc_bits_per_mb(frame_type, i, 1.0) <= target_bits_per_mb )
+      break;
+  }
+
+  return target_index - qindex;
+}
+
+void vp9_rc_update_framerate(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int vbr_max_bits;
+
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate);
+  rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth *
+                                oxcf->two_pass_vbrmin_section / 100);
+
+  rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth *
+                     oxcf->two_pass_vbrmax_section) / 100);
+  rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
+                                    vbr_max_bits);
+
+  // Set Maximum gf/arf interval
+  rc->max_gf_interval = 16;
+
+  // Extended interval for genuinely static scenes
+  rc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1;
+
+  // Special conditions when alt ref frame enabled in lagged compress mode
+  if (oxcf->play_alternate && oxcf->lag_in_frames) {
+    if (rc->max_gf_interval > oxcf->lag_in_frames - 1)
+      rc->max_gf_interval = oxcf->lag_in_frames - 1;
+
+    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+  }
 
-  return 1;
+  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+    rc->max_gf_interval = rc->static_scene_max_gf_interval;
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
index ddda7130c9a..b1cc676091f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -12,28 +12,174 @@
 #ifndef VP9_ENCODER_VP9_RATECTRL_H_
 #define VP9_ENCODER_VP9_RATECTRL_H_
 
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vpx/vpx_integer.h"
 
-#define FRAME_OVERHEAD_BITS 200
+#include "vp9/common/vp9_blockd.h"
 
-void vp9_save_coding_context(VP9_COMP *cpi);
-void vp9_restore_coding_context(VP9_COMP *cpi);
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
-                                   int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit);
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
 
-// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi);
+typedef struct {
+  // Rate targeting variables
+  int base_frame_target;           // A baseline frame target before adjustment
+                                   // for previous under or over shoot.
+  int this_frame_target;           // Actual frame target after rc adjustment.
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[3];                   // Separate values for Intra/Inter/ARF-GF
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+
+  double rate_correction_factor;
+  double key_frame_rate_correction_factor;
+  double gf_rate_correction_factor;
+
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+  int max_gf_interval;
+  int static_scene_max_gf_interval;
+  int baseline_gf_interval;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
+
+  int avg_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[3];        // 0 - KEY, 1 - INTER, 2 - ARF/GF
+  double tot_q;
+  double avg_q;
+
+  int64_t buffer_level;
+  int64_t bits_off_target;
+  int64_t vbr_bits_off_target;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int64_t total_actual_bits;
+  int64_t total_target_bits;
+  int64_t total_target_vs_actual;
+
+  int worst_quality;
+  int best_quality;
+  // int active_best_quality;
+} RATE_CONTROL;
+
+struct VP9_COMP;
+struct VP9EncoderConfig;
+
+void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
+                 RATE_CONTROL *rc);
 
 double vp9_convert_qindex_to_q(int qindex);
-int vp9_gfboost_qadjust(int qindex);
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                    double correction_factor);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
+
+void vp9_rc_init_minq_luts(void);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+//   vp9_rc_get_one_pass_vbr_params()
+//   vp9_rc_get_one_pass_cbr_params()
+//   vp9_rc_get_svc_params()
+//   vp9_rc_get_first_pass_params()
+//   vp9_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+//   vp9_rc_postencode_update()
+//   vp9_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the vp9_rc_get_..._params() functions and
+// updated during the vp9_rc_postencode_update...() functions.
+// The only exceptions are vp9_rc_drop_frame() and
+// vp9_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi);
+void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi);
+void vp9_rc_get_svc_params(struct VP9_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void vp9_rc_postencode_update(struct VP9_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int vp9_rc_drop_frame(struct VP9_COMP *cpi);
+
+// Computes frame size bounds.
+void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi,
+                             int *bottom_index,
+                             int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi,
+                                    int target);
+int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi,
+                                    int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the vp9_rc_get_..._params() functions.
+void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio);
+
+void vp9_rc_update_framerate(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
index f9de78bf82c..64f3e5a7479 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -8,39 +8,43 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
 
-#include "vp9/common/vp9_pragmas.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_modecosts.h"
-#include "vp9/encoder/vp9_encodeintra.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_variance.h"
 
-#define INVALID_MV 0x80008000
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+#define RD_THRESH_POW      1.25
+#define RD_MULT_EPB_RATIO  64
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -51,91 +55,144 @@
 
 #define MIN_EARLY_TERM_INDEX    3
 
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {NEARESTMV, LAST_FRAME,   NONE},
-  {NEARESTMV, ALTREF_FRAME, NONE},
-  {NEARESTMV, GOLDEN_FRAME, NONE},
-
-  {DC_PRED,   INTRA_FRAME,  NONE},
-
-  {NEWMV,     LAST_FRAME,   NONE},
-  {NEWMV,     ALTREF_FRAME, NONE},
-  {NEWMV,     GOLDEN_FRAME, NONE},
-
-  {NEARMV,    LAST_FRAME,   NONE},
-  {NEARMV,    ALTREF_FRAME, NONE},
-  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-
-  {TM_PRED,   INTRA_FRAME,  NONE},
-
-  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
-  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
-  {NEARMV,    GOLDEN_FRAME, NONE},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
-
-  {ZEROMV,    LAST_FRAME,   NONE},
-  {ZEROMV,    GOLDEN_FRAME, NONE},
-  {ZEROMV,    ALTREF_FRAME, NONE},
-  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
-
-  {H_PRED,    INTRA_FRAME,  NONE},
-  {V_PRED,    INTRA_FRAME,  NONE},
-  {D135_PRED, INTRA_FRAME,  NONE},
-  {D207_PRED, INTRA_FRAME,  NONE},
-  {D153_PRED, INTRA_FRAME,  NONE},
-  {D63_PRED,  INTRA_FRAME,  NONE},
-  {D117_PRED, INTRA_FRAME,  NONE},
-  {D45_PRED,  INTRA_FRAME,  NONE},
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
+
+struct rdcost_block_args {
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  int this_rate;
+  int64_t this_dist;
+  int64_t this_sse;
+  int64_t this_rd;
+  int64_t best_rd;
+  int skip;
+  int use_fast_coef_costing;
+  const scan_order *so;
+};
+
+static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+  {NEARESTMV, {LAST_FRAME,   NONE}},
+  {NEARESTMV, {ALTREF_FRAME, NONE}},
+  {NEARESTMV, {GOLDEN_FRAME, NONE}},
+
+  {DC_PRED,   {INTRA_FRAME,  NONE}},
+
+  {NEWMV,     {LAST_FRAME,   NONE}},
+  {NEWMV,     {ALTREF_FRAME, NONE}},
+  {NEWMV,     {GOLDEN_FRAME, NONE}},
+
+  {NEARMV,    {LAST_FRAME,   NONE}},
+  {NEARMV,    {ALTREF_FRAME, NONE}},
+  {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
+  {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+
+  {TM_PRED,   {INTRA_FRAME,  NONE}},
+
+  {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
+  {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
+  {NEARMV,    {GOLDEN_FRAME, NONE}},
+  {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
+
+  {ZEROMV,    {LAST_FRAME,   NONE}},
+  {ZEROMV,    {GOLDEN_FRAME, NONE}},
+  {ZEROMV,    {ALTREF_FRAME, NONE}},
+  {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
+  {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
+
+  {H_PRED,    {INTRA_FRAME,  NONE}},
+  {V_PRED,    {INTRA_FRAME,  NONE}},
+  {D135_PRED, {INTRA_FRAME,  NONE}},
+  {D207_PRED, {INTRA_FRAME,  NONE}},
+  {D153_PRED, {INTRA_FRAME,  NONE}},
+  {D63_PRED,  {INTRA_FRAME,  NONE}},
+  {D117_PRED, {INTRA_FRAME,  NONE}},
+  {D45_PRED,  {INTRA_FRAME,  NONE}},
 };
 
-const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
-  {LAST_FRAME,   NONE},
-  {GOLDEN_FRAME, NONE},
-  {ALTREF_FRAME, NONE},
-  {LAST_FRAME,   ALTREF_FRAME},
-  {GOLDEN_FRAME, ALTREF_FRAME},
-  {INTRA_FRAME,  NONE},
+static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
+  {{LAST_FRAME,   NONE}},
+  {{GOLDEN_FRAME, NONE}},
+  {{ALTREF_FRAME, NONE}},
+  {{LAST_FRAME,   ALTREF_FRAME}},
+  {{GOLDEN_FRAME, ALTREF_FRAME}},
+  {{INTRA_FRAME,  NONE}},
 };
 
 // The baseline rd thresholds for breaking out of the rd loop for
 // certain modes are assumed to be based on 8x8 blocks.
 // This table is used to correct for blocks size.
 // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
-static int rd_thresh_block_size_factor[BLOCK_SIZES] =
-  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
+};
 
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC      1
-#define RD_THRESH_POW      1.25
-#define RD_MULT_EPB_RATIO  64
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
+                               int raster_block, int stride) {
+  const int bw = b_width_log2(plane_bsize);
+  const int y = 4 * (raster_block >> bw);
+  const int x = 4 * (raster_block & ((1 << bw) - 1));
+  return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                          int raster_block, int16_t *base) {
+  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  return base + raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+static void fill_mode_costs(VP9_COMP *cpi) {
+  const FRAME_CONTEXT *const fc = &cpi->common.fc;
+  int i, j;
 
-#define MV_COST_WEIGHT      108
-#define MV_COST_WEIGHT_SUB  120
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+                      vp9_intra_mode_tree);
+
+  // TODO(rbultje) separate tables for superblock costing?
+  vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
+  vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME],
+                  vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
+  vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME],
+                  fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    vp9_cost_tokens(cpi->switchable_interp_costs[i],
+                    fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
+}
 
 static void fill_token_costs(vp9_coeff_cost *c,
-                             vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
+                             vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
   int i, j, k, l;
   TX_SIZE t;
-  for (t = TX_4X4; t <= TX_32X32; t++)
-    for (i = 0; i < BLOCK_TYPES; i++)
-      for (j = 0; j < REF_TYPES; j++)
-        for (k = 0; k < COEF_BANDS; k++)
-          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+  for (t = TX_4X4; t <= TX_32X32; ++t)
+    for (i = 0; i < PLANE_TYPES; ++i)
+      for (j = 0; j < REF_TYPES; ++j)
+        for (k = 0; k < COEF_BANDS; ++k)
+          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
             vp9_prob probs[ENTROPY_NODES];
             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                             vp9_coef_tree);
             vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                  vp9_coef_tree);
-            assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
-                   c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
+            assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
+                   c[t][i][j][k][1][l][EOB_TOKEN]);
           }
 }
 
-static const int rd_iifactor[32] = {
+static const uint8_t rd_iifactor[32] = {
   4, 4, 3, 2, 1, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0,
@@ -155,13 +212,13 @@ void vp9_init_me_luts() {
   // This is to make it easier to resolve the impact of experimental changes
   // to the quantizer tables.
   for (i = 0; i < QINDEX_RANGE; i++) {
-    sad_per_bit16lut[i] =
-      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
+    const double q = vp9_convert_qindex_to_q(i);
+    sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
+    sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
   }
 }
 
-int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
   const int q = vp9_dc_quant(qindex, 0);
   // TODO(debargha): Adjust the function below
   int rdmult = 88 * q * q / 25;
@@ -175,12 +232,9 @@ int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
 }
 
 static int compute_rd_thresh_factor(int qindex) {
-  int q;
   // TODO(debargha): Adjust the function below
-  q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
-  if (q < 8)
-    q = 8;
-  return q;
+  const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
+  return MAX(q, 8);  // never let the factor drop below 8
 }
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
@@ -188,115 +242,90 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 }
 
-static void set_block_thresholds(VP9_COMP *cpi) {
+static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
   int i, bsize, segment_id;
-  VP9_COMMON *cm = &cpi->common;
 
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
-    int q;
-    int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-    segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
-    q = compute_rd_thresh_factor(segment_qindex);
+    const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
+                                            cm->base_qindex) + cm->y_dc_delta_q,
+                             0, MAXQ);
+    const int q = compute_rd_thresh_factor(qindex);
 
     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
-      // Threshold here seem unecessarily harsh but fine given actual
-      // range of values used for cpi->sf.thresh_mult[]
-      int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
-
-      for (i = 0; i < MAX_MODES; ++i) {
-        if (cpi->sf.thresh_mult[i] < thresh_max) {
-          cpi->rd_threshes[segment_id][bsize][i] =
-              cpi->sf.thresh_mult[i] * q *
-              rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
-        }
-      }
-
-      for (i = 0; i < MAX_REFS; ++i) {
-        if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
-          cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
-              cpi->sf.thresh_mult_sub8x8[i] * q *
-              rd_thresh_block_size_factor[bsize] / 4;
-        } else {
-          cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
-        }
+      // Threshold here seems unnecessarily harsh but fine given the actual
+      // range of values used for rd->thresh_mult[]/thresh_mult_sub8x8[].
+      const int t = q * rd_thresh_block_size_factor[bsize];
+      const int thresh_max = INT_MAX / t;
+
+      if (bsize >= BLOCK_8X8) {  // MAX_MODES entries from thresh_mult
+        for (i = 0; i < MAX_MODES; ++i)
+          rd->threshes[segment_id][bsize][i] =
+              rd->thresh_mult[i] < thresh_max
+                  ? rd->thresh_mult[i] * t / 4
+                  : INT_MAX;  // saturate rather than risk overflow
+      } else {  // bsize < BLOCK_8X8: MAX_REFS sub8x8 entries
+        for (i = 0; i < MAX_REFS; ++i)
+          rd->threshes[segment_id][bsize][i] =
+              rd->thresh_mult_sub8x8[i] < thresh_max
+                  ? rd->thresh_mult_sub8x8[i] * t / 4
+                  : INT_MAX;
       }
     }
   }
 }
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int qindex, i;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  RD_OPT *const rd = &cpi->rd;
+  int i;
 
-  cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
-  cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
+  vp9_clear_system_state();
 
-  cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+  rd->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
+  rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
-  vp9_set_speed_features(cpi);
+  x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
+  x->errorperbit += (x->errorperbit == 0);  // bump 0 up to 1
 
-  set_block_thresholds(cpi);
+  x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+                         cm->frame_type != KEY_FRAME) ? 0 : 1;
 
-  fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
+  set_block_thresholds(cm, rd);
 
-  for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_cost_tokens(cpi->mb.partition_cost[i],
-                    cm->fc.partition_prob[cm->frame_type][i],
-                    vp9_partition_tree);
+  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
+    fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
-  /*rough estimate for costing*/
-  vp9_init_mode_costs(cpi);
+    for (i = 0; i < PARTITION_CONTEXTS; i++)
+      vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
+                      vp9_partition_tree);
+  }
 
-  if (!frame_is_intra_only(cm)) {
-    vp9_build_nmv_cost_table(
-        cpi->mb.nmvjointcost,
-        cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
-        &cm->fc.nmvc,
-        cm->allow_high_precision_mv, 1, 1);
+  if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
+      cm->frame_type == KEY_FRAME) {  // non-RD mode: 1 frame in 8 + key frames
+    fill_mode_costs(cpi);
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      MB_PREDICTION_MODE m;
+    if (!frame_is_intra_only(cm)) {
+      vp9_build_nmv_cost_table(x->nmvjointcost,
+                               cm->allow_high_precision_mv ? x->nmvcost_hp
+                                                           : x->nmvcost,
+                               &cm->fc.nmvc, cm->allow_high_precision_mv);
 
-      for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
-        cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
-            cost_token(vp9_inter_mode_tree,
-                       cm->fc.inter_mode_probs[i],
-                       &vp9_inter_mode_encodings[inter_mode_offset(m)]);
+      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+        vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
+                        cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
     }
   }
 }
 
-static INLINE void linear_interpolate2(double x, int ntab, int inv_step,
-                                       const double *tab1, const double *tab2,
-                                       double *v1, double *v2) {
-  double y = x * inv_step;
-  int d = (int) y;
-  if (d >= ntab - 1) {
-    *v1 = tab1[ntab - 1];
-    *v2 = tab2[ntab - 1];
-  } else {
-    double a = y - d;
-    *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a;
-    *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a;
-  }
-}
+static const int MAX_XSQ_Q10 = 245727;
 
-static void model_rd_norm(double x, double *R, double *D) {
-  static const int inv_tab_step = 8;
-  static const int tab_size = 120;
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
   // NOTE: The tables below must be of the same size
-  //
+
+  // The functions described below are sampled at the four most significant
+  // bits of x^2 + 8 / 256
+
   // Normalized rate
   // This table models the rate for a Laplacian source
   // source with given variance when quantized with a uniform quantizer
@@ -304,22 +333,20 @@ static void model_rd_norm(double x, double *R, double *D) {
   // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
   // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
   // and H(x) is the binary entropy function.
-  static const double rate_tab[] = {
-    64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
-    2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
-    1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
-    0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
-    0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
-    0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
-    0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
-    0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
-    0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
-    0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
-    0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
-    0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
-    0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
-    0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
-    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
+  static const int rate_tab_q10[] = {
+    65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
+     4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
+     3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
+     3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
+     2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
+     2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
+     1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
+     1159,  1086,  1021,   963,   911,   864,   821,   781,
+      745,   680,   623,   574,   530,   490,   455,   424,
+      395,   345,   304,   269,   239,   213,   190,   171,
+      154,   126,   104,    87,    73,    61,    52,    44,
+       38,    28,    21,    16,    12,    10,     8,     6,
+        5,     3,     2,     1,     1,     1,     0,     0,
   };
   // Normalized distortion
   // This table models the normalized distortion for a Laplacian source
@@ -328,54 +355,74 @@ static void model_rd_norm(double x, double *R, double *D) {
   // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
   // where x = qpstep / sqrt(variance)
   // Note the actual distortion is Dn * variance.
-  static const double dist_tab[] = {
-    0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
-    0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
-    0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
-    0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
-    0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
-    0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
-    0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
-    0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
-    0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
-    0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
-    0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
-    0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
-    0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
-    0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
-    0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
+  static const int dist_tab_q10[] = {
+       0,     0,     1,     1,     1,     2,     2,     2,
+       3,     3,     4,     5,     5,     6,     7,     7,
+       8,     9,    11,    12,    13,    15,    16,    17,
+      18,    21,    24,    26,    29,    31,    34,    36,
+      39,    44,    49,    54,    59,    64,    69,    73,
+      78,    88,    97,   106,   115,   124,   133,   142,
+     151,   167,   184,   200,   215,   231,   245,   260,
+     274,   301,   327,   351,   375,   397,   418,   439,
+     458,   495,   528,   559,   587,   613,   637,   659,
+     680,   717,   749,   777,   801,   823,   842,   859,
+     874,   899,   919,   936,   949,   960,   969,   977,
+     983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
+    1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
+  };
+  static const int xsq_iq_q10[] = {
+         0,      4,      8,     12,     16,     20,     24,     28,
+        32,     40,     48,     56,     64,     72,     80,     88,
+        96,    112,    128,    144,    160,    176,    192,    208,
+       224,    256,    288,    320,    352,    384,    416,    448,
+       480,    544,    608,    672,    736,    800,    864,    928,
+       992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
+      2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
+      4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
+      8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
+     16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
+     32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
+     65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
+    131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
   };
   /*
-  assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]);
-  assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]);
-  assert(sizeof(rate_tab) == sizeof(dist_tab));
+  static const int tab_size = sizeof(rate_tab_q10) / sizeof(rate_tab_q10[0]);
+  assert(sizeof(dist_tab_q10) / sizeof(dist_tab_q10[0]) == tab_size);
+  assert(sizeof(xsq_iq_q10) / sizeof(xsq_iq_q10[0]) == tab_size);
+  assert(MAX_XSQ_Q10 + 1 == xsq_iq_q10[tab_size - 1]);
   */
-  assert(x >= 0.0);
-  linear_interpolate2(x, tab_size, inv_tab_step,
-                      rate_tab, dist_tab, R, D);
+  int tmp = (xsq_q10 >> 2) + 8;
+  int k = get_msb(tmp) - 3;
+  int xq = (k << 3) + ((tmp >> k) & 0x7);
+  const int one_q10 = 1 << 10;
+  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+  const int b_q10 = one_q10 - a_q10;
+  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }
 
-static void model_rd_from_var_lapndz(int var, int n, int qstep,
-                                     int *rate, int64_t *dist) {
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist) {
   // This function models the rate and distortion for a Laplacian
   // source with given variance when quantized with a uniform quantizer
   // with given stepsize. The closed form expressions are in:
   // Hang and Chen, "Source Model for transform video coder and its
   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
   // Sys. for Video Tech., April 1997.
-  vp9_clear_system_state();
-  if (var == 0 || n == 0) {
+  if (var == 0) {
     *rate = 0;
     *dist = 0;
   } else {
-    double D, R;
-    double s2 = (double) var / n;
-    double x = qstep / sqrt(s2);
-    model_rd_norm(x, &R, &D);
-    *rate = (int)((n << 8) * R + 0.5);
-    *dist = (int)(var * D + 0.5);
+    int d_q10, r_q10;
+    const uint64_t xsq_q10_64 =  // qstep^2 * n / var, rounded, in Q10
+        ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+    const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
+                        MAX_XSQ_Q10 : (int)xsq_q10_64;
+    model_rd_norm(xsq_q10, &r_q10, &d_q10);
+    *rate = (n * r_q10 + 2) >> 2;  // rounded shift: Q10 down to Q8
+    *dist = (var * (int64_t)d_q10 + 512) >> 10;  // rounded Q10 -> integer
   }
-  vp9_clear_system_state();
 }
 
 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
@@ -384,26 +431,48 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
-  int i, rate_sum = 0, dist_sum = 0;
+  int i;
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  const int ref = xd->mi[0]->mbmi.ref_frame[0];
+  unsigned int sse;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-    unsigned int sse;
-    int rate;
-    int64_t dist;
+
     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                               pd->dst.buf, pd->dst.stride, &sse);
-    // sse works better than var, since there is no dc prediction used
-    model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
-                             pd->dequant[1] >> 3, &rate, &dist);
 
-    rate_sum += rate;
-    dist_sum += (int)dist;
+    if (i == 0)
+      x->pred_sse[ref] = sse;
+
+    // Fast approximation of the modelling function.
+    if (cpi->oxcf.speed > 4) {
+      int64_t rate;
+      int64_t dist;
+      int64_t square_error = sse;
+      int quantizer = (pd->dequant[1] >> 3);
+
+      if (quantizer < 120)
+        rate = (square_error * (280 - quantizer)) >> 8;
+      else
+        rate = 0;
+      dist = (square_error * quantizer) >> 8;
+      rate_sum += rate;
+      dist_sum += dist;
+    } else {
+      int rate;
+      int64_t dist;
+      vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+                                   pd->dequant[1] >> 3, &rate, &dist);
+      rate_sum += rate;
+      dist_sum += dist;
+    }
   }
 
-  *out_rate_sum = rate_sum;
+  *out_rate_sum = (int)rate_sum;
   *out_dist_sum = dist_sum << 4;
 }
 
@@ -414,10 +483,10 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                  int *out_skip) {
   int j, k;
   BLOCK_SIZE bs;
-  struct macroblock_plane *const p = &x->plane[0];
-  struct macroblockd_plane *const pd = &xd->plane[0];
-  const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
-  const int height = 4 << num_4x4_blocks_high_lookup[bsize];
+  const struct macroblock_plane *const p = &x->plane[0];
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[bsize];
   int rate_sum = 0;
   int64_t dist_sum = 0;
   const int t = 4 << tx_size;
@@ -444,7 +513,8 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
                          &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
                          &sse);
       // sse works better than var, since there is no dc prediction used
-      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
+      vp9_model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
+                                   &rate, &dist);
       rate_sum += rate;
       dist_sum += dist;
       *out_skip &= (rate < 1024);
@@ -455,15 +525,15 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum = dist_sum << 4;
 }
 
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
   int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += (unsigned)this_diff * this_diff;
-    sqcoeff += (unsigned) coeff[i] * coeff[i];
+    const int diff = coeff[i] - dqcoeff[i];
+    error += (int64_t)diff * diff;  // diff^2 can exceed INT_MAX
+    sqcoeff += (int64_t)coeff[i] * coeff[i];
   }
 
   *ssz = sqcoeff;
@@ -481,43 +551,41 @@ static const int16_t band_counts[TX_SIZES][8] = {
   { 1, 2, 3, 4, 11,  256 - 21, 0 },
   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 };
-
 static INLINE int cost_coeffs(MACROBLOCK *x,
                               int plane, int block,
                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                               TX_SIZE tx_size,
-                              const int16_t *scan, const int16_t *nb) {
+                              const int16_t *scan, const int16_t *nb,
+                              int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct macroblockd_plane *pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const struct macroblock_plane *p = &x->plane[plane];
+  const struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
-  const int eob = pd->eobs[block];
-  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
-  unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
-                   x->token_costs[tx_size][type][ref];
-  const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
-  uint8_t *p_tok = x->token_cache;
-  int pt = combine_entropy_contexts(above_ec, left_ec);
+  const int eob = p->eobs[block];
+  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
+  uint8_t token_cache[32 * 32];
+  int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
-
   // Check for consistency of tx_size with mode info
-  assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
-                                      : get_uv_tx_size(mbmi) == tx_size);
+  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
+                              : get_uv_tx_size(mbmi) == tx_size);
 
   if (eob == 0) {
     // single eob token
-    cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
+    cost = token_costs[0][0][pt][EOB_TOKEN];
     c = 0;
   } else {
     int band_left = *band_count++;
 
     // dc token
-    int v = qcoeff_ptr[0];
+    int v = qcoeff[0];
     int prev_t = vp9_dct_value_tokens_ptr[v].token;
     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
-    p_tok[0] = vp9_pt_energy_class[prev_t];
+    token_cache[0] = vp9_pt_energy_class[prev_t];
     ++token_costs;
 
     // ac tokens
@@ -525,11 +593,15 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
       const int rc = scan[c];
       int t;
 
-      v = qcoeff_ptr[rc];
+      v = qcoeff[rc];
       t = vp9_dct_value_tokens_ptr[v].token;
-      pt = get_coef_context(nb, p_tok, c);
-      cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
-      p_tok[rc] = vp9_pt_energy_class[t];
+      if (use_fast_coef_costing) {
+        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+      } else {
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+        token_cache[rc] = vp9_pt_energy_class[t];
+      }
       prev_t = t;
       if (!--band_left) {
         band_left = *band_count++;
@@ -539,8 +611,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
 
     // eob token
     if (band_left) {
-      pt = get_coef_context(nb, p_tok, c);
-      cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
+      if (use_fast_coef_costing) {
+        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+      } else {
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*token_costs)[0][pt][EOB_TOKEN];
+      }
     }
   }
 
@@ -549,24 +625,22 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
 
   return cost;
 }
-
-static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
+static void dist_block(int plane, int block, TX_SIZE tx_size,
+                       struct rdcost_block_args* args) {
   const int ss_txfrm_size = tx_size << 1;
-  struct rdcost_block_args* args = arg;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   int64_t this_sse;
-  int shift = args->tx_size == TX_32X32 ? 0 : 2;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                                &this_sse) >> shift;
   args->sse  = this_sse >> shift;
 
-  if (x->skip_encode &&
-      xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) {
+  if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
                     (1 << ss_txfrm_size)) >> (shift + 2);
@@ -576,32 +650,31 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
 }
 
 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                       TX_SIZE tx_size, void *arg) {
-  struct rdcost_block_args* args = arg;
-
+                       TX_SIZE tx_size, struct rdcost_block_args* args) {
   int x_idx, y_idx;
-  txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 
   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
-                           args->t_left + y_idx, args->tx_size,
-                           args->scan, args->nb);
+                           args->t_left + y_idx, tx_size,
+                           args->so->scan, args->so->neighbors,
+                           args->use_fast_coef_costing);
 }
 
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
-                           TX_SIZE tx_size, void *arg) {
+static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, void *arg) {
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args encode_args = {x, NULL};
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int64_t rd1, rd2, rd;
 
   if (args->skip)
     return;
 
-  if (!is_inter_block(&xd->mi_8x8[0]->mbmi))
-    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args);
+  if (!is_inter_block(mbmi))
+    vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
   else
-    vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args);
+    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
   dist_block(plane, block, tx_size, args);
   rate_block(plane, block, plane_bsize, tx_size, args);
@@ -611,7 +684,8 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
   if (plane == 0)
-    x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
+    x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
+                                    (rd1 > rd2 && !xd->lossless);
 
   args->this_rate += args->rate;
   args->this_dist += args->dist;
@@ -624,10 +698,16 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   }
 }
 
-void vp9_get_entropy_contexts(TX_SIZE tx_size,
-    ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
-    const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
-    int num_4x4_w, int num_4x4_h) {
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const ENTROPY_CONTEXT *const above = pd->above_context;
+  const ENTROPY_CONTEXT *const left = pd->left_context;
+
   int i;
   switch (tx_size) {
     case TX_4X4:
@@ -653,57 +733,43 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size,
         t_left[i] = !!*(const uint64_t *)&left[i];
       break;
     default:
-      assert(!"Invalid transform size.");
+      assert(0 && "Invalid transform size.");
   }
 }
 
-static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,
-                              const int num_4x4_w, const int num_4x4_h,
-                              const int64_t ref_rdcost,
-                              struct rdcost_block_args *arg) {
-  vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
-  arg->x = x;
-  arg->tx_size = tx_size;
-  arg->bw = num_4x4_w;
-  arg->bh = num_4x4_h;
-  arg->best_rd = ref_rdcost;
-}
-
 static void txfm_rd_in_plane(MACROBLOCK *x,
-                             struct rdcost_block_args *rd_stack,
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size) {
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+  vp9_zero(args);
+  args.x = x;
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_costing;
 
-  init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
-                    ref_best_rd, rd_stack);
   if (plane == 0)
-    xd->mi_8x8[0]->mbmi.tx_size = tx_size;
+    xd->mi[0]->mbmi.tx_size = tx_size;
 
-  vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
-                           pd->above_context, pd->left_context,
-                           num_4x4_w, num_4x4_h);
+  vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
-  get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb);
+  args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 
-  foreach_transformed_block_in_plane(xd, bsize, plane,
-                                     block_yrd_txfm, rd_stack);
-  if (rd_stack->skip) {
+  vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+                                         block_rd_txfm, &args);
+  if (args.skip) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
     *sse        = INT64_MAX;
     *skippable  = 0;
   } else {
-    *distortion = rd_stack->this_dist;
-    *rate       = rd_stack->this_rate;
-    *sse        = rd_stack->this_sse;
-    *skippable  = vp9_is_skippable_in_plane(xd, bsize, plane);
+    *distortion = args.this_dist;
+    *rate       = args.this_rate;
+    *sse        = args.this_sse;
+    *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
   }
 }
 
 
@@ -716,13 +782,13 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
   VP9_COMMON *const cm = &cpi->common;
   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 
-  txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+  txfm_rd_in_plane(x, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
-                   mbmi->tx_size);
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
   cpi->tx_stepdown_count[0]++;
 }
 
@@ -735,64 +801,50 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
-  int64_t rd[TX_SIZES][2];
-  int n, m;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX}};
+  TX_SIZE n, m;
   int s0, s1;
+  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = TX_4X4;
 
-  const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
-
-  for (n = TX_4X4; n <= max_tx_size; n++) {
-    r[n][1] = r[n][0];
-    if (r[n][0] == INT_MAX)
-      continue;
-    for (m = 0; m <= n - (n == max_tx_size); m++) {
-      if (m == n)
-        r[n][1] += vp9_cost_zero(tx_probs[m]);
-      else
-        r[n][1] += vp9_cost_one(tx_probs[m]);
-    }
-  }
-
+  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    r[n][1] = r[n][0];
+    if (r[n][0] < INT_MAX) {
+      for (m = 0; m <= n - (n == max_tx_size); m++) {
+        if (m == n)
+          r[n][1] += vp9_cost_zero(tx_probs[m]);
+        else
+          r[n][1] += vp9_cost_one(tx_probs[m]);
+      }
+    }
     if (d[n] == INT64_MAX) {
       rd[n][0] = rd[n][1] = INT64_MAX;
-      continue;
-    }
-    if (s[n]) {
+    } else if (s[n]) {
       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
     } else {
       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
-  }
 
-  if (max_tx_size == TX_32X32 &&
-      (cm->tx_mode == ALLOW_32X32 ||
-       (cm->tx_mode == TX_MODE_SELECT &&
-        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
-        rd[TX_32X32][1] < rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_32X32;
-  } else if (max_tx_size >= TX_16X16 &&
-             (cm->tx_mode == ALLOW_16X16 ||
-              cm->tx_mode == ALLOW_32X32 ||
-              (cm->tx_mode == TX_MODE_SELECT &&
-               rd[TX_16X16][1] < rd[TX_8X8][1] &&
-               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_16X16;
-  } else if (cm->tx_mode == ALLOW_8X8 ||
-             cm->tx_mode == ALLOW_16X16 ||
-             cm->tx_mode == ALLOW_32X32 ||
-           (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
-    mbmi->tx_size = TX_8X8;
-  } else {
-    mbmi->tx_size = TX_4X4;
+    if (rd[n][1] < best_rd) {
+      best_tx = n;
+      best_rd = rd[n][1];
+    }
   }
+  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+                      best_tx : MIN(max_tx_size, max_mode_tx_size);
+
 
   *distortion = d[mbmi->tx_size];
   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
@@ -802,33 +854,27 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
-      rd[TX_32X32][1] < rd[TX_4X4][1])
-    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (max_tx_size >= TX_16X16 &&
-           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
-    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
-  else
-    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
-                                 rd[TX_4X4][1] : rd[TX_8X8][1];
 
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] < rd[TX_16X16][1] &&
-      rd[TX_32X32][1] < rd[TX_8X8][1] &&
-      rd[TX_32X32][1] < rd[TX_4X4][1]) {
+  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
     cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 &&
-             rd[TX_16X16][1] < rd[TX_8X8][1] &&
-             rd[TX_16X16][1] < rd[TX_4X4][1]) {
+  } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
   } else {
+    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
   }
 }
 
+static int64_t scaled_rd_cost(int rdmult, int rddiv,
+                              int rate, int64_t dist, double scale) {
+  return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
+}
+
 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
                                           int (*r)[2], int *rate,
                                           int64_t *d, int64_t *distortion,
@@ -838,20 +884,26 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
-  int64_t rd[TX_SIZES][2];
-  int n, m;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX},
+                             {INT64_MAX, INT64_MAX}};
+  TX_SIZE n, m;
   int s0, s1;
   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
-  // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
-
-  const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
+  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = TX_4X4;
 
-  // for (n = TX_4X4; n <= max_txfm_size; n++)
-  //   r[n][0] = (r[n][0] * scale_r[n]);
+  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    double scale = scale_rd[n];
     r[n][1] = r[n][0];
     for (m = 0; m <= n - (n == max_tx_size); m++) {
       if (m == n)
@@ -859,62 +911,33 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
       else
         r[n][1] += vp9_cost_one(tx_probs[m]);
     }
-  }
-
-  assert(skip_prob > 0);
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-
-  for (n = TX_4X4; n <= max_tx_size; n++) {
     if (s[n]) {
-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+      rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n],
+                                           scale);
     } else {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+      rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n],
+                                scale);
+      rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n],
+                                scale);
+    }
+    if (rd[n][1] < best_rd) {
+      best_rd = rd[n][1];
+      best_tx = n;
     }
-  }
-  for (n = TX_4X4; n <= max_tx_size; n++) {
-    rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
-    rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
   }
 
-  if (max_tx_size == TX_32X32 &&
-      (cm->tx_mode == ALLOW_32X32 ||
-       (cm->tx_mode == TX_MODE_SELECT &&
-        rd[TX_32X32][1] <= rd[TX_16X16][1] &&
-        rd[TX_32X32][1] <= rd[TX_8X8][1] &&
-        rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_32X32;
-  } else if (max_tx_size >= TX_16X16 &&
-             (cm->tx_mode == ALLOW_16X16 ||
-              cm->tx_mode == ALLOW_32X32 ||
-              (cm->tx_mode == TX_MODE_SELECT &&
-               rd[TX_16X16][1] <= rd[TX_8X8][1] &&
-               rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
-    mbmi->tx_size = TX_16X16;
-  } else if (cm->tx_mode == ALLOW_8X8 ||
-             cm->tx_mode == ALLOW_16X16 ||
-             cm->tx_mode == ALLOW_32X32 ||
-           (cm->tx_mode == TX_MODE_SELECT &&
-            rd[TX_8X8][1] <= rd[TX_4X4][1])) {
-    mbmi->tx_size = TX_8X8;
-  } else {
-    mbmi->tx_size = TX_4X4;
-  }
+  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+                      best_tx : MIN(max_tx_size, max_mode_tx_size);
 
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
-  txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
-                   &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
+  txfm_rd_in_plane(x, rate, distortion, skip,
+                   &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size,
+                   cpi->sf.use_fast_coef_costing);
 
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] <= rd[TX_16X16][1] &&
-      rd[TX_32X32][1] <= rd[TX_8X8][1] &&
-      rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
     cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 &&
-             rd[TX_16X16][1] <= rd[TX_8X8][1] &&
-             rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+  } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
@@ -923,25 +946,23 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
   }
 }
 
-static void super_block_yrd(VP9_COMP *cpi,
-                            MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, int64_t *psse, BLOCK_SIZE bs,
-                            int64_t txfm_cache[TX_MODES],
-                            int64_t ref_best_rd) {
+static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                  int64_t *distortion, int *skip,
+                                  int64_t *psse, BLOCK_SIZE bs,
+                                  int64_t txfm_cache[TX_MODES],
+                                  int64_t ref_best_rd) {
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
-  const int b_inter_mode = is_inter_block(mbmi);
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  TX_SIZE tx_size;
 
   assert(bs == mbmi->sb_type);
-  if (b_inter_mode)
-    vp9_subtract_sby(x, bs);
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
-      (cpi->sf.tx_size_search_method != USE_FULL_RD &&
-       !b_inter_mode)) {
+  vp9_subtract_plane(x, bs, 0);
+
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                              ref_best_rd, bs);
@@ -950,36 +971,18 @@ static void super_block_yrd(VP9_COMP *cpi,
     return;
   }
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
-      b_inter_mode) {
-    if (bs >= BLOCK_32X32)
-      model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
-                           &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-    if (bs >= BLOCK_16X16)
-      model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
-                           &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-
-    model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
-                         &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-
-    model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
-                         &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
+  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
+    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+      model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
+                           &r[tx_size][0], &d[tx_size], &s[tx_size]);
     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
                                   skip, sse, ref_best_rd, bs);
   } else {
-    if (bs >= BLOCK_32X32)
-      txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
-                       &s[TX_32X32], &sse[TX_32X32],
-                       ref_best_rd, 0, bs, TX_32X32);
-    if (bs >= BLOCK_16X16)
-      txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
-                       &s[TX_16X16], &sse[TX_16X16],
-                       ref_best_rd, 0, bs, TX_16X16);
-    txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
-                     &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
-    txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
-                     &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
+    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
+                       &s[tx_size], &sse[tx_size],
+                       ref_best_rd, 0, bs, tx_size,
+                       cpi->sf.use_fast_coef_costing);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                              skip, txfm_cache, bs);
   }
@@ -987,8 +990,39 @@ static void super_block_yrd(VP9_COMP *cpi,
     *psse = sse[mbmi->tx_size];
 }
 
-static int conditional_skipintra(MB_PREDICTION_MODE mode,
-                                 MB_PREDICTION_MODE best_intra_mode) {
+static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                  int64_t *distortion, int *skip,
+                                  int64_t *psse, BLOCK_SIZE bs,
+                                  int64_t txfm_cache[TX_MODES],
+                                  int64_t ref_best_rd) {
+  int64_t sse[TX_SIZES];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+  assert(bs == mbmi->sb_type);
+  if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
+    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
+    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
+                             ref_best_rd, bs);
+  } else {
+    int r[TX_SIZES][2], s[TX_SIZES];
+    int64_t d[TX_SIZES];
+    TX_SIZE tx_size;
+    for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
+      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
+                       &s[tx_size], &sse[tx_size],
+                       ref_best_rd, 0, bs, tx_size,
+                       cpi->sf.use_fast_coef_costing);
+    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                             skip, txfm_cache, bs);
+  }
+  if (psse)
+    *psse = sse[mbmi->tx_size];
+}
+
+
+static int conditional_skipintra(PREDICTION_MODE mode,
+                                 PREDICTION_MODE best_intra_mode) {
   if (mode == D117_PRED &&
       best_intra_mode != V_PRED &&
       best_intra_mode != D135_PRED)
@@ -1009,27 +1043,24 @@ static int conditional_skipintra(MB_PREDICTION_MODE mode,
 }
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     MB_PREDICTION_MODE *best_mode,
-                                     int *bmode_costs,
+                                     PREDICTION_MODE *best_mode,
+                                     const int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
                                      int64_t *bestdistortion,
                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
-  MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
+  PREDICTION_MODE mode;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-  int rate = 0;
-  int64_t distortion;
+
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib,
-                                                p->src.buf, src_stride);
-  uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib,
-                                                pd->dst.buf, dst_stride);
-  int16_t *src_diff, *coeff;
-
+  const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
+                                                            src_stride)];
+  uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
+                                                       dst_stride)];
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
 
@@ -1042,11 +1073,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
   vpx_memcpy(ta, a, sizeof(ta));
   vpx_memcpy(tl, l, sizeof(tl));
-  xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
+  xd->mi[0]->mbmi.tx_size = TX_4X4;
 
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
+    int64_t distortion = 0;
+    int rate = bmode_costs[mode];
 
     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
       continue;
@@ -1058,56 +1091,52 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           continue;
     }
 
-    rate = bmode_costs[mode];
-    distortion = 0;
-
     vpx_memcpy(tempa, ta, sizeof(ta));
     vpx_memcpy(templ, tl, sizeof(tl));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        int64_t ssz;
-        const int16_t *scan;
-        const int16_t *nb;
-        uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
-        uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
         const int block = ib + idy * 2 + idx;
-        TX_TYPE tx_type;
-        xd->mi_8x8[0]->bmi[block].as_mode = mode;
-        src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+                                                            p->src_diff);
+        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        xd->mi[0]->bmi[block].as_mode = mode;
         vp9_predict_intra_block(xd, block, 1,
                                 TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
-                                dst, dst_stride);
-        vp9_subtract_block(4, 4, src_diff, 8,
-                           src, src_stride,
-                           dst, dst_stride);
-
-        tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
-        get_scan_nb_4x4(tx_type, &scan, &nb);
-
-        if (tx_type != DCT_DCT)
-          vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
-        else
-          x->fwd_txm4x4(src_diff, coeff, 8);
-
-        vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
-
-        ratey += cost_coeffs(x, 0, block,
-                             tempa + idx, templ + idy, TX_4X4, scan, nb);
-        distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
-                                      16, &ssz) >> 2;
-        if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
-          goto next;
-
-        if (tx_type != DCT_DCT)
-          vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
-                               dst, pd->dst.stride, tx_type);
-        else
-          xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
-                       16);
+                                dst, dst_stride, idx, idy, 0);
+        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+
+        if (xd->lossless) {
+          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          vp9_fwht4x4(src_diff, coeff, 8);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors,
+                               cpi->sf.use_fast_coef_costing);
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
+                          p->eobs[block]);
+        } else {
+          int64_t unused;
+          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+          vp9_fht4x4(src_diff, coeff, 8, tx_type);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                             so->scan, so->neighbors,
+                             cpi->sf.use_fast_coef_costing);
+          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                                        16, &unused) >> 2;
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                         dst, dst_stride, p->eobs[block]);
+        }
       }
     }
 
@@ -1140,18 +1169,16 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   return best_rd;
 }
 
-static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
-                                            MACROBLOCK * const mb,
-                                            int * const rate,
-                                            int * const rate_y,
-                                            int64_t * const distortion,
+static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
+                                            int *rate, int *rate_y,
+                                            int64_t *distortion,
                                             int64_t best_rd) {
   int i, j;
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  MODE_INFO *const mic = xd->mi_8x8[0];
-  const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
-  const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
-  const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
+  const MACROBLOCKD *const xd = &mb->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
@@ -1160,25 +1187,23 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
   int tot_rate_y = 0;
   int64_t total_rd = 0;
   ENTROPY_CONTEXT t_above[4], t_left[4];
-  int *bmode_costs;
+  const int *bmode_costs = cpi->mbmode_cost;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
-  bmode_costs = mb->mbmode_cost;
-
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
-      MB_PREDICTION_MODE best_mode = DC_PRED;
+      PREDICTION_MODE best_mode = DC_PRED;
       int r = INT_MAX, ry = INT_MAX;
       int64_t d = INT64_MAX, this_rd = INT64_MAX;
       i = idy * 2 + idx;
       if (cpi->common.frame_type == KEY_FRAME) {
-        const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i);
-        const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, i);
+        const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
+        const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
 
-        bmode_costs  = mb->y_mode_costs[A][L];
+        bmode_costs  = cpi->y_mode_costs[A][L];
       }
 
       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
@@ -1217,15 +1242,15 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                       BLOCK_SIZE bsize,
                                       int64_t tx_cache[TX_MODES],
                                       int64_t best_rd) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE mode_selected = DC_PRED;
+  PREDICTION_MODE mode;
+  PREDICTION_MODE mode_selected = DC_PRED;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi_8x8[0];
+  MODE_INFO *const mic = xd->mi[0];
   int this_rate, this_rate_tokenonly, s;
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
   int i;
-  int *bmode_costs = x->mbmode_cost;
+  int *bmode_costs = cpi->mbmode_cost;
 
   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
     for (i = 0; i < TX_MODES; i++)
@@ -1234,22 +1259,22 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   /* Y Search for intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int64_t local_tx_cache[TX_MODES];
-    MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
-    MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
+    MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+    MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
 
     if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
       continue;
 
     if (cpi->common.frame_type == KEY_FRAME) {
-      const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0);
-      const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, 0);
+      const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+      const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
 
-      bmode_costs = x->y_mode_costs[A][L];
+      bmode_costs = cpi->y_mode_costs[A][L];
     }
     mic->mbmi.mode = mode;
 
-    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
-                    bsize, local_tx_cache, best_rd);
+    intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+        &s, NULL, bsize, local_tx_cache, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1284,12 +1309,12 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
+static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
                              int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
   int plane;
   int pnrate = 0, pnskip = 1;
@@ -1298,8 +1323,11 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
   if (ref_best_rd < 0)
     goto term;
 
-  if (is_inter_block(mbmi))
-    vp9_subtract_sbuv(x, bsize);
+  if (is_inter_block(mbmi)) {
+    int plane;
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp9_subtract_plane(x, bsize, plane);
+  }
 
   *rate = 0;
   *distortion = 0;
@@ -1307,8 +1335,9 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
   *skippable = 1;
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
-                     ref_best_rd, plane, bsize, uv_txfm_size);
+    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+                     ref_best_rd, plane, bsize, uv_txfm_size,
+                     cpi->sf.use_fast_coef_costing);
     if (pnrate == INT_MAX)
       goto term;
     *rate += pnrate;
@@ -1327,32 +1356,29 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                       PICK_MODE_CONTEXT *ctx,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
-                                       BLOCK_SIZE bsize) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE mode_selected = DC_PRED;
+                                       BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  PREDICTION_MODE mode;
+  PREDICTION_MODE mode_selected = DC_PRED;
   int64_t best_rd = INT64_MAX, this_rd;
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 
-  // int mode_mask = (bsize <= BLOCK_8X8)
-  //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
-    // if (!(mode_mask & (1 << mode)))
-    if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
-          & (1 << mode)))
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
-    x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
+    xd->mi[0]->mbmi.uv_mode = mode;
 
     super_block_uvrd(cpi, x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
     if (this_rate_tokenonly == INT_MAX)
       continue;
     this_rate = this_rate_tokenonly +
-                x->intra_uv_mode_cost[cpi->common.frame_type][mode];
+                cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
@@ -1362,72 +1388,84 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
       *skippable      = s;
+      if (!x->select_txfm_size) {
+        int i;
+        struct macroblock_plane *const p = x->plane;
+        struct macroblockd_plane *const pd = xd->plane;
+        for (i = 1; i < MAX_MB_PLANE; ++i) {
+          p[i].coeff    = ctx->coeff_pbuf[i][2];
+          p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
+          pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+          p[i].eobs    = ctx->eobs_pbuf[i][2];
+
+          ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
+          ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
+          ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
+          ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
+
+          ctx->coeff_pbuf[i][0]   = p[i].coeff;
+          ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
+          ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+          ctx->eobs_pbuf[i][0]    = p[i].eobs;
+        }
+      }
     }
   }
 
-  x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
-
+  xd->mi[0]->mbmi.uv_mode = mode_selected;
   return best_rd;
 }
 
-static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
+static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
                               int *rate, int *rate_tokenonly,
                               int64_t *distortion, int *skippable,
                               BLOCK_SIZE bsize) {
-  int64_t this_rd;
-  int64_t this_sse;
+  const VP9_COMMON *cm = &cpi->common;
+  int64_t unused;
 
-  x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
+  x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
-                   skippable, &this_sse, bsize, INT64_MAX);
-  *rate = *rate_tokenonly +
-          x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
-  this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-
-  return this_rd;
+                   skippable, &unused, bsize, INT64_MAX);
+  *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+                                 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
-                                 MB_PREDICTION_MODE *mode_uv) {
+                                 PREDICTION_MODE *mode_uv) {
   MACROBLOCK *const x = &cpi->mb;
 
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   if (cpi->sf.use_uv_intra_rd_estimate) {
-    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                   bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                   skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   // Else do a proper rd search for each possible transform size that may
   // be considered in the main rd loop.
   } else {
-    rd_pick_intra_sbuv_mode(cpi, x,
+    rd_pick_intra_sbuv_mode(cpi, x, ctx,
                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                            bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+                            bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
   }
-  *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
+  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
 }
 
-static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
+static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
                        int mode_context) {
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+  const MACROBLOCK *const x = &cpi->mb;
+  const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
 
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     assert(is_inter_mode(mode));
-    return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
+    return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
   } else {
     return 0;
   }
 }
 
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
-  x->e_mbd.mi_8x8[0]->mbmi.mode = mb;
-  x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int;
-}
-
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE bsize,
                                 int_mv *frame_mv,
@@ -1435,79 +1473,59 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 int_mv single_newmv[MAX_REF_FRAMES],
                                 int *rate_mv);
 
-static int labels2mode(MACROBLOCK *x, int i,
-                       MB_PREDICTION_MODE this_mode,
-                       int_mv *this_mv, int_mv *this_second_mv,
-                       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-                       int_mv seg_mvs[MAX_REF_FRAMES],
-                       int_mv *best_ref_mv,
-                       int_mv *second_best_ref_mv,
-                       int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mi_8x8[0];
-  MB_MODE_INFO *mbmi = &mic->mbmi;
-  int cost = 0, thismvcost = 0;
+static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
+                                PREDICTION_MODE mode, int_mv this_mv[2],
+                                int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                                int_mv seg_mvs[MAX_REF_FRAMES],
+                                int_mv *best_ref_mv[2], const int *mvjcost,
+                                int *mvcost[2]) {
+  MODE_INFO *const mic = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mic->mbmi;
+  int thismvcost = 0;
   int idx, idy;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  const int has_second_rf = has_second_ref(mbmi);
+  const int is_compound = has_second_ref(mbmi);
 
-  /* We have to be careful retrieving previously-encoded motion vectors.
-   Ones from this macroblock have to be pulled from the BLOCKD array
-   as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  MB_PREDICTION_MODE m;
-
-  // the only time we should do costing for new motion vector or mode
-  // is when we are on a new label  (jbb May 08, 2007)
-  switch (m = this_mode) {
+  switch (mode) {
     case NEWMV:
-      this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
-      thismvcost  = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
+      this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+      thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
-      if (has_second_rf) {
-        this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
-        thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
-                                      &second_best_ref_mv->as_mv,
+      if (is_compound) {
+        this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+        thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       }
       break;
-    case NEARESTMV:
-      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
-      if (has_second_rf)
-        this_second_mv->as_int =
-            frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
-      break;
     case NEARMV:
-      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
-      if (has_second_rf)
-        this_second_mv->as_int =
-            frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
+    case NEARESTMV:
+      this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+      if (is_compound)
+        this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
       break;
     case ZEROMV:
-      this_mv->as_int = 0;
-      if (has_second_rf)
-        this_second_mv->as_int = 0;
+      this_mv[0].as_int = 0;
+      if (is_compound)
+        this_mv[1].as_int = 0;
       break;
     default:
       break;
   }
 
-  cost = cost_mv_ref(cpi, this_mode,
-                     mbmi->mode_context[mbmi->ref_frame[0]]);
-
-  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-  if (has_second_rf)
-    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
+  mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
+  if (is_compound)
+    mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
 
-  mic->bmi[i].as_mode = m;
+  mic->bmi[i].as_mode = mode;
 
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
                  &mic->bmi[i], sizeof(mic->bmi[i]));
 
-  cost += thismvcost;
-  return cost;
+  return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
+            thismvcost;
 }
 
 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
@@ -1517,32 +1535,38 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                                        int *labelyrate,
                                        int64_t *distortion, int64_t *sse,
                                        ENTROPY_CONTEXT *ta,
-                                       ENTROPY_CONTEXT *tl) {
+                                       ENTROPY_CONTEXT *tl,
+                                       int mi_row, int mi_col) {
   int k;
   MACROBLOCKD *xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
-  MODE_INFO *const mi = xd->mi_8x8[0];
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
-  const int width = plane_block_width(bsize, pd);
-  const int height = plane_block_height(bsize, pd);
+  MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   int idx, idy;
 
-  uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i,
-                                                 p->src.buf, p->src.stride);
-  uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i,
-                                                 pd->dst.buf, pd->dst.stride);
+  const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
+                                                             p->src.stride)];
+  uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
+                                                        pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
+  const scan_order *so = &vp9_default_scan_orders[TX_4X4];
   const int is_compound = has_second_ref(&mi->mbmi);
+  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
-                                     pd->pre[ref].buf, pd->pre[ref].stride);
+    const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+                                               pd->pre[ref].stride)];
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
-                              &xd->scale_factor[ref],
-                              width, height, ref, &xd->subpix, MV_PRECISION_Q3);
+                              &xd->block_refs[ref]->sf, width, height, ref,
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE + 4 * (i % 2),
+                              mi_row * MI_SIZE + 4 * (i / 2));
   }
 
   vp9_subtract_block(height, width,
@@ -1560,16 +1584,13 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
       coeff = BLOCK_OFFSET(p->coeff, k);
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
-      vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
-                                 get_iscan_4x4(DCT_DCT));
+      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
       thissse += ssz;
-      thisrate += cost_coeffs(x, 0, k,
-                              ta + (k & 1),
-                              tl + (k >> 1), TX_4X4,
-                              vp9_default_scan_4x4,
-                              vp9_default_scan_4x4_neighbors);
+      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
+                              so->scan, so->neighbors,
+                              cpi->sf.use_fast_coef_costing);
       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
       rd = MIN(rd1, rd2);
@@ -1598,7 +1619,7 @@ typedef struct {
 } SEG_RDSTAT;
 
 typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
+  int_mv *ref_mv[2];
   int_mv mvp;
 
   int64_t segment_rd;
@@ -1606,55 +1627,108 @@ typedef struct {
   int64_t d;
   int64_t sse;
   int segment_yrate;
-  MB_PREDICTION_MODE modes[4];
+  PREDICTION_MODE modes[4];
   SEG_RDSTAT rdstat[4][INTER_MODES];
   int mvthresh;
 } BEST_SEG_INFO;
 
-static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
-  int r = 0;
-  r |= (mv->as_mv.row >> 3) < x->mv_row_min;
-  r |= (mv->as_mv.row >> 3) > x->mv_row_max;
-  r |= (mv->as_mv.col >> 3) < x->mv_col_min;
-  r |= (mv->as_mv.col >> 3) > x->mv_col_max;
-  return r;
+static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
+  return (mv->row >> 3) < x->mv_row_min ||
+         (mv->row >> 3) > x->mv_row_max ||
+         (mv->col >> 3) < x->mv_col_min ||
+         (mv->col >> 3) > x->mv_col_max;
 }
 
 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
-  MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
 
-  p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf,
-                                         p->src.stride);
+  p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
-  pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
-                                             pd->pre[0].stride);
+  pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
+                                                       pd->pre[0].stride)];
   if (has_second_ref(mbmi))
-    pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
-                                               pd->pre[1].stride);
+    pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
+                                                         pd->pre[1].stride)];
 }
 
 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
                                   struct buf_2d orig_pre[2]) {
-  MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
   x->plane[0].src = orig_src;
   x->e_mbd.plane[0].pre[0] = orig_pre[0];
   if (has_second_ref(mbmi))
     x->e_mbd.plane[0].pre[1] = orig_pre[1];
 }
 
-static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
-                                    const TileInfo *const tile,
-                                    BEST_SEG_INFO *bsi_buf, int filter_idx,
-                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
-                                    int mi_row, int mi_col) {
-  int i, br = 0, idx, idy;
+static INLINE int mv_has_subpel(const MV *mv) {
+  return (mv->row & 0x0F) || (mv->col & 0x0F);
+}
+
+// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
+// TODO(aconverse): Find out if this is still productive then clean up or remove
+static int check_best_zero_mv(
+    const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+    int disable_inter_mode_mask, int this_mode,
+    const MV_REFERENCE_FRAME ref_frames[2]) {
+  if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
+      (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+      frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
+      (ref_frames[1] == NONE ||
+       frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
+    int rfc = mode_context[ref_frames[0]];
+    int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+    int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+    int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+    if (this_mode == NEARMV) {
+      if (c1 > c3) return 0;
+    } else if (this_mode == NEARESTMV) {
+      if (c2 > c3) return 0;
+    } else {
+      assert(this_mode == ZEROMV);
+      if (ref_frames[1] == NONE) {
+        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
+            (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
+          return 0;
+      } else {
+        if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
+             frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
+            (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
+             frame_mv[NEARMV][ref_frames[1]].as_int == 0))
+          return 0;
+      }
+    }
+  }
+  return 1;
+}
+
+static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                        const TileInfo * const tile,
+                                        int_mv *best_ref_mv,
+                                        int_mv *second_best_ref_mv,
+                                        int64_t best_rd, int *returntotrate,
+                                        int *returnyrate,
+                                        int64_t *returndistortion,
+                                        int *skippable, int64_t *psse,
+                                        int mvthresh,
+                                        int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                        BEST_SEG_INFO *bsi_buf, int filter_idx,
+                                        int mi_row, int mi_col) {
+  int i;
+  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  int mode_idx;
+  int k, br = 0, idx, idy;
   int64_t bd = 0, block_sse = 0;
-  MB_PREDICTION_MODE this_mode;
-  MODE_INFO *mi = x->e_mbd.mi_8x8[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
+  PREDICTION_MODE this_mode;
+  VP9_COMMON *cm = &cpi->common;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
   const int label_count = 4;
   int64_t this_segment_rd = 0;
   int label_mv_thresh;
@@ -1662,18 +1736,25 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
-  vp9_variance_fn_ptr_t *v_fn_ptr;
   ENTROPY_CONTEXT t_above[2], t_left[2];
-  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
-  int mode_idx;
   int subpelmv = 1, have_ref = 0;
   const int has_second_rf = has_second_ref(mbmi);
+  const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+
+  vp9_zero(*bsi);
+
+  bsi->segment_rd = best_rd;
+  bsi->ref_mv[0] = best_ref_mv;
+  bsi->ref_mv[1] = second_best_ref_mv;
+  bsi->mvp.as_int = best_ref_mv->as_int;
+  bsi->mvthresh = mvthresh;
+
+  for (i = 0; i < 4; i++)
+    bsi->modes[i] = ZEROMV;
 
   vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
   vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
 
-  v_fn_ptr = &cpi->fn_ptr[bsize];
-
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
   // segments.   setting this to 1 would make mv thresh
@@ -1685,68 +1766,35 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
-      int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
+      int_mv mode_mv[MB_MODE_COUNT][2];
       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-      MB_PREDICTION_MODE mode_selected = ZEROMV;
+      PREDICTION_MODE mode_selected = ZEROMV;
       int64_t best_rd = INT64_MAX;
-      i = idy * 2 + idx;
-
-      frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
-      vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
-                                    &frame_mv[NEARESTMV][mbmi->ref_frame[0]],
-                                    &frame_mv[NEARMV][mbmi->ref_frame[0]],
-                                    i, 0, mi_row, mi_col);
-      if (has_second_rf) {
-        frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
-        vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
-                                      &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
-                                      &frame_mv[NEARMV][mbmi->ref_frame[1]],
-                                      i, 1, mi_row, mi_col);
+      const int i = idy * 2 + idx;
+      int ref;
+
+      for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+        const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+        frame_mv[ZEROMV][frame].as_int = 0;
+        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
+                                      &frame_mv[NEARESTMV][frame],
+                                      &frame_mv[NEARMV][frame]);
       }
+
       // search for the best motion vector on this segment
       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
         const struct buf_2d orig_src = x->plane[0].src;
         struct buf_2d orig_pre[2];
 
-        mode_idx = inter_mode_offset(this_mode);
+        mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        if (disable_inter_mode_mask & (1 << mode_idx))
+          continue;
 
-        // if we're near/nearest and mv == 0,0, compare to zeromv
-        if ((this_mode == NEARMV || this_mode == NEARESTMV ||
-             this_mode == ZEROMV) &&
-            frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
-            (!has_second_rf ||
-             frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
-          int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
-          int c1 = cost_mv_ref(cpi, NEARMV, rfc);
-          int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
-          int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
-
-          if (this_mode == NEARMV) {
-            if (c1 > c3)
-              continue;
-          } else if (this_mode == NEARESTMV) {
-            if (c2 > c3)
-              continue;
-          } else {
-            assert(this_mode == ZEROMV);
-            if (!has_second_rf) {
-              if ((c3 >= c2 &&
-                   frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
-                  (c3 >= c1 &&
-                   frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
-                continue;
-            } else {
-              if ((c3 >= c2 &&
-                   frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
-                   frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
-                  (c3 >= c1 &&
-                   frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
-                   frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
-                continue;
-            }
-          }
-        }
+        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
+                                disable_inter_mode_mask,
+                                this_mode, mbmi->ref_frame))
+          continue;
 
         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
@@ -1757,11 +1805,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
+          MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
-          int further_steps;
           int thissme, bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
-          int_mv mvp_full;
+          MV mvp_full;
           int max_mv;
 
           /* Is the best so far sufficiently good that we cant justify doing
@@ -1769,7 +1817,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           if (best_rd < label_mv_thresh)
             break;
 
-          if (cpi->compressor_speed) {
+          if (!is_best_mode(cpi->oxcf.mode)) {
             // use previous block's result as next block's MV predictor.
             if (i > 0) {
               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1782,102 +1830,86 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           else
             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
 
-          if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
+          if (cpi->sf.auto_mv_step_size && cm->show_frame) {
             // Take wtd average of the step_params based on the last frame's
             // max mv magnitude and the best ref mvs of the current block for
             // the given reference.
-            step_param = (vp9_init_search_range(cpi, max_mv) +
-                          cpi->mv_step_param) >> 1;
+            step_param = (vp9_init_search_range(&cpi->sf, max_mv) +
+                              cpi->mv_step_param) / 2;
           } else {
             step_param = cpi->mv_step_param;
           }
 
-          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+          mvp_full.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
-          if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) {
-            mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
-            mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
+          if (cpi->sf.adaptive_motion_search && cm->show_frame) {
+            mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
+            mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
             step_param = MAX(step_param, 8);
           }
 
-          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
           // adjust src pointer for this block
           mi_buf_shift(x, i);
-          if (cpi->sf.search_method == HEX) {
-            bestsme = vp9_hex_search(x, &mvp_full.as_mv,
-                                     step_param,
-                                     sadpb, 1, v_fn_ptr, 1,
-                                     &bsi->ref_mv->as_mv,
-                                     &mode_mv[NEWMV].as_mv);
-          } else if (cpi->sf.search_method == SQUARE) {
-            bestsme = vp9_square_search(x, &mvp_full.as_mv,
-                                        step_param,
-                                        sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv->as_mv,
-                                        &mode_mv[NEWMV].as_mv);
-          } else if (cpi->sf.search_method == BIGDIA) {
-            bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
-                                        step_param,
-                                        sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv->as_mv,
-                                        &mode_mv[NEWMV].as_mv);
-          } else {
-            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                             sadpb, further_steps, 0, v_fn_ptr,
-                                             bsi->ref_mv, &mode_mv[NEWMV]);
-          }
+
+          vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
+
+          bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+                                      sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
+                                      INT_MAX, 1);
 
           // Should we do a full search (best quality only)
-          if (cpi->compressor_speed == 0) {
+          if (is_best_mode(cpi->oxcf.mode)) {
+            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
             /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full.as_mv, x->mv_col_min, x->mv_col_max,
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
-
             thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
-                                           x->nmvjointcost, x->mvcost,
-                                           bsi->ref_mv, i);
-
+                                           sadpb, 16, &cpi->fn_ptr[bsize],
+                                           &bsi->ref_mv[0]->as_mv,
+                                           &best_mv->as_mv);
             if (thissme < bestsme) {
               bestsme = thissme;
-              mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int;
+              *new_mv = best_mv->as_mv;
             } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int;
+              // The full search result is actually worse so re-instate the
+              // previous best vector
+              best_mv->as_mv = *new_mv;
             }
           }
 
           if (bestsme < INT_MAX) {
             int distortion;
-            unsigned int sse;
             cpi->find_fractional_mv_step(x,
-                                         &mode_mv[NEWMV].as_mv,
-                                         &bsi->ref_mv->as_mv,
-                                         cpi->common.allow_high_precision_mv,
-                                         x->errorperbit, v_fn_ptr,
-                                         0, cpi->sf.subpel_iters_per_step,
+                                         new_mv,
+                                         &bsi->ref_mv[0]->as_mv,
+                                         cm->allow_high_precision_mv,
+                                         x->errorperbit, &cpi->fn_ptr[bsize],
+                                         cpi->sf.subpel_force_stop,
+                                         cpi->sf.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
-                                         &distortion, &sse);
+                                         &distortion,
+                                         &x->pred_sse[mbmi->ref_frame[0]]);
 
             // save motion search result for use in compound prediction
-            seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
+            seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
           }
 
           if (cpi->sf.adaptive_motion_search)
-            x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
+            x->pred_mv[mbmi->ref_frame[0]].as_mv = *new_mv;
 
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
-        if (has_second_rf && this_mode == NEWMV &&
-            mbmi->interp_filter == EIGHTTAP) {
+        if (has_second_rf) {
           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
             continue;
+        }
 
+        if (has_second_rf && this_mode == NEWMV &&
+            mbmi->interp_filter == EIGHTTAP) {
           // adjust src pointers
           mi_buf_shift(x, i);
           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -1895,57 +1927,44 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         }
 
         bsi->rdstat[i][mode_idx].brate =
-            labels2mode(x, i, this_mode, &mode_mv[this_mode],
-                        &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
-                        bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                        x->mvcost, cpi);
-
-        bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
-        if (num_4x4_blocks_wide > 1)
-          bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
-              mode_mv[this_mode].as_int;
-        if (num_4x4_blocks_high > 1)
-          bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
-              mode_mv[this_mode].as_int;
-        if (has_second_rf) {
-          bsi->rdstat[i][mode_idx].mvs[1].as_int =
-              second_mode_mv[this_mode].as_int;
+            set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode],
+                                 frame_mv, seg_mvs[i], bsi->ref_mv,
+                                 x->nmvjointcost, x->mvcost);
+
+        for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+          bsi->rdstat[i][mode_idx].mvs[ref].as_int =
+              mode_mv[this_mode][ref].as_int;
           if (num_4x4_blocks_wide > 1)
-            bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
-                second_mode_mv[this_mode].as_int;
+            bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
+                mode_mv[this_mode][ref].as_int;
           if (num_4x4_blocks_high > 1)
-            bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
-                second_mode_mv[this_mode].as_int;
+            bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
+                mode_mv[this_mode][ref].as_int;
         }
 
         // Trap vectors that reach beyond the UMV borders
-        if (mv_check_bounds(x, &mode_mv[this_mode]))
-          continue;
-        if (has_second_rf &&
-            mv_check_bounds(x, &second_mode_mv[this_mode]))
+        if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
+            (has_second_rf &&
+             mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
           continue;
 
         if (filter_idx > 0) {
           BEST_SEG_INFO *ref_bsi = bsi_buf;
-          subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) ||
-                     (mode_mv[this_mode].as_mv.col & 0x0f);
-          have_ref = mode_mv[this_mode].as_int ==
-                     ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
-          if (has_second_rf) {
-            subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
-                        (second_mode_mv[this_mode].as_mv.col & 0x0f);
-            have_ref  &= second_mode_mv[this_mode].as_int ==
-                         ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
+          subpelmv = 0;
+          have_ref = 1;
+
+          for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+            subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+            have_ref &= mode_mv[this_mode][ref].as_int ==
+                ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
           }
 
           if (filter_idx > 1 && !subpelmv && !have_ref) {
             ref_bsi = bsi_buf + 1;
-            have_ref = mode_mv[this_mode].as_int ==
-                       ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
-            if (has_second_rf) {
-              have_ref  &= second_mode_mv[this_mode].as_int ==
-                           ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
-            }
+            have_ref = 1;
+            for (ref = 0; ref < 1 + has_second_rf; ++ref)
+              have_ref &= mode_mv[this_mode][ref].as_int ==
+                  ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
           }
 
           if (!subpelmv && have_ref &&
@@ -1974,16 +1993,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     &bsi->rdstat[i][mode_idx].bdist,
                                     &bsi->rdstat[i][mode_idx].bsse,
                                     bsi->rdstat[i][mode_idx].ta,
-                                    bsi->rdstat[i][mode_idx].tl);
+                                    bsi->rdstat[i][mode_idx].tl,
+                                    mi_row, mi_col);
         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                             bsi->rdstat[i][mode_idx].brate, 0);
           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
-          bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
           if (num_4x4_blocks_wide > 1)
-            bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
           if (num_4x4_blocks_high > 1)
-            bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
+            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
         }
 
         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -1998,17 +2018,16 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
-        return;
+        return INT64_MAX;
       }
 
-      mode_idx = inter_mode_offset(mode_selected);
+      mode_idx = INTER_OFFSET(mode_selected);
       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
 
-      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
-                  &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
-                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                  x->mvcost, cpi);
+      set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
+                           frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
+                           x->mvcost);
 
       br += bsi->rdstat[i][mode_idx].brate;
       bd += bsi->rdstat[i][mode_idx].bdist;
@@ -2022,7 +2041,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
-        return;
+        return INT64_MAX;
       }
     }
   } /* for each label */
@@ -2034,54 +2053,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   bsi->sse = block_sse;
 
   // update the coding decisions
-  for (i = 0; i < 4; ++i)
-    bsi->modes[i] = mi->bmi[i].as_mode;
-}
-
-static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
-                                           const TileInfo *const tile,
-                                           int_mv *best_ref_mv,
-                                           int_mv *second_best_ref_mv,
-                                           int64_t best_rd,
-                                           int *returntotrate,
-                                           int *returnyrate,
-                                           int64_t *returndistortion,
-                                           int *skippable, int64_t *psse,
-                                           int mvthresh,
-                                           int_mv seg_mvs[4][MAX_REF_FRAMES],
-                                           BEST_SEG_INFO *bsi_buf,
-                                           int filter_idx,
-                                           int mi_row, int mi_col) {
-  int i;
-  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mi_8x8[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int mode_idx;
-
-  vp9_zero(*bsi);
-
-  bsi->segment_rd = best_rd;
-  bsi->ref_mv = best_ref_mv;
-  bsi->second_ref_mv = second_best_ref_mv;
-  bsi->mvp.as_int = best_ref_mv->as_int;
-  bsi->mvthresh = mvthresh;
-
-  for (i = 0; i < 4; i++)
-    bsi->modes[i] = ZEROMV;
-
-  rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
-                          mi_row, mi_col);
+  for (k = 0; k < 4; ++k)
+    bsi->modes[k] = mi->bmi[k].as_mode;
 
   if (bsi->segment_rd > best_rd)
     return INT64_MAX;
   /* set it to the best */
   for (i = 0; i < 4; i++) {
-    mode_idx = inter_mode_offset(bsi->modes[i]);
+    mode_idx = INTER_OFFSET(bsi->modes[i]);
     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
     if (has_second_ref(mbmi))
       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
-    xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
+    x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
     mi->bmi[i].as_mode = bsi->modes[i];
   }
 
@@ -2091,7 +2074,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   *returntotrate = bsi->r;
   *returndistortion = bsi->d;
   *returnyrate = bsi->segment_yrate;
-  *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0);
+  *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
   *psse = bsi->sse;
   mbmi->mode = bsi->modes[3];
 
@@ -2102,14 +2085,14 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                     uint8_t *ref_y_buffer, int ref_y_stride,
                     int ref_frame, BLOCK_SIZE block_size ) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int_mv this_mv;
   int i;
   int zero_seen = 0;
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
-  unsigned int max_mv = 0;
+  int max_mv = 0;
 
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
@@ -2119,16 +2102,21 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                      cpi->common.show_frame &&
                      block_size < cpi->sf.max_partition_size);
 
+  int_mv pred_mv[3];
+  pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
+  pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
+  pred_mv[2] = x->pred_mv[ref_frame];
+
   // Get the sad for each candidate reference mv
   for (i = 0; i < num_mv_refs; i++) {
-    this_mv.as_int = (i < MAX_MV_REF_CANDIDATES) ?
-        mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int;
+    this_mv.as_int = pred_mv[i].as_int;
 
     max_mv = MAX(max_mv,
                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
-    // The list is at an end if we see 0 for a second time.
+    // Only need to evaluate the zero mv once; skip repeated zero candidates.
     if (!this_mv.as_int && zero_seen)
-      break;
+      continue;
+
     zero_seen = zero_seen || !this_mv.as_int;
 
     row_offset = this_mv.as_mv.row >> 3;
@@ -2150,14 +2138,15 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   // Note the index of the mv that worked best in the reference list.
   x->mv_best_ref_index[ref_frame] = best_index;
   x->max_mv_context[ref_frame] = max_mv;
+  x->pred_mv_sad[ref_frame] = best_sad;
 }
 
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+static void estimate_ref_frame_costs(const VP9_COMMON *cm,
+                                     const MACROBLOCKD *xd,
+                                     int segment_id,
                                      unsigned int *ref_costs_single,
                                      unsigned int *ref_costs_comp,
                                      vp9_prob *comp_mode_p) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
                                              SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
@@ -2165,11 +2154,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
     *comp_mode_p = 128;
   } else {
-    vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
+    vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
     vp9_prob comp_inter_p = 128;
 
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+      comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
       *comp_mode_p = comp_inter_p;
     } else {
       *comp_mode_p = 128;
@@ -2177,12 +2166,12 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
 
     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
-    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
         base_cost += vp9_cost_bit(comp_inter_p, 0);
 
       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
@@ -2197,11 +2186,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
       ref_costs_single[GOLDEN_FRAME] = 512;
       ref_costs_single[ALTREF_FRAME] = 512;
     }
-    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+    if (cm->reference_mode != SINGLE_REFERENCE) {
       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
         base_cost += vp9_cost_bit(comp_inter_p, 1);
 
       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
@@ -2217,8 +2206,8 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
                          int_mv *ref_mv,
                          int_mv *second_ref_mv,
-                         int64_t comp_pred_diff[NB_PREDICTION_TYPES],
-                         int64_t tx_size_diff[TX_MODES],
+                         int64_t comp_pred_diff[REFERENCE_MODES],
+                         const int64_t tx_size_diff[TX_MODES],
                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
@@ -2226,14 +2215,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   // restored if we decide to encode this way
   ctx->skip = x->skip;
   ctx->best_mode_index = mode_index;
-  ctx->mic = *xd->mi_8x8[0];
+  ctx->mic = *xd->mi[0];
 
-  ctx->best_ref_mv.as_int = ref_mv->as_int;
-  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
+  ctx->best_ref_mv[0].as_int = ref_mv->as_int;
+  ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
 
-  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
-  ctx->comp_pred_diff   = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
-  ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+  ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
 
   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
@@ -2266,91 +2255,84 @@ static void setup_pred_block(const MACROBLOCKD *xd,
   }
 }
 
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               const TileInfo *const tile,
-                               int idx, MV_REFERENCE_FRAME frame_type,
-                               BLOCK_SIZE block_size,
-                               int mi_row, int mi_col,
-                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
-                               int_mv frame_near_mv[MAX_REF_FRAMES],
-                               struct buf_2d yv12_mb[4][MAX_MB_PLANE],
-                               struct scale_factors scale[MAX_REF_FRAMES]) {
-  VP9_COMMON *cm = &cpi->common;
-  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            MV_REFERENCE_FRAME ref_frame,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+  const VP9_COMMON *cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
-  // set up scaling factors
-  scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
-
-  scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
-                                            mi_row * MI_SIZE, mi_col * MI_SIZE);
+  MODE_INFO *const mi = xd->mi[0];
+  int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
+  const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
-                   &scale[frame_type], &scale[frame_type]);
+  setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0],
-                   xd->last_mi,
-                   frame_type,
-                   mbmi->ref_mvs[frame_type], mi_row, mi_col);
+  vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
 
   // Candidate refinement carried out at encoder and decoder
-  vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
-                        mbmi->ref_mvs[frame_type],
-                        &frame_nearest_mv[frame_type],
-                        &frame_near_mv[frame_type]);
+  vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                        &frame_nearest_mv[ref_frame],
+                        &frame_near_mv[ref_frame]);
 
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
   // The current implementation doesn't support scaling.
-  if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8)
-    mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
-            frame_type, block_size);
+  if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
+    mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+            ref_frame, block_size);
 }
 
-static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
-  YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
-  int fb = get_ref_frame_idx(cpi, ref_frame);
-  int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame);
-  if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb])
-    scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]];
-  return scaled_ref_frame;
+const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+                                                   int ref_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
+  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+  return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
 }
 
-static INLINE int get_switchable_rate(const MACROBLOCK *x) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+int vp9_get_switchable_rate(const VP9_COMP *cpi) {
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   return SWITCHABLE_INTERP_RATE_FACTOR *
-             x->switchable_interp_costs[ctx][mbmi->interp_filter];
+             cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
 }
 
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                 const TileInfo *const tile,
                                  BLOCK_SIZE bsize,
                                  int mi_row, int mi_col,
                                  int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  const VP9_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
   int bestsme = INT_MAX;
-  int further_steps, step_param;
+  int step_param;
   int sadpb = x->sadperbit16;
-  int_mv mvp_full;
+  MV mvp_full;
   int ref = mbmi->ref_frame[0];
-  int_mv ref_mv = mbmi->ref_mvs[ref][0];
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
   int tmp_row_min = x->mv_row_min;
   int tmp_row_max = x->mv_row_max;
 
-  YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
+                                                                        ref);
+
+  MV pred_mv[3];
+  pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref].as_mv;
 
   if (scaled_ref_frame) {
     int i;
@@ -2360,84 +2342,62 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     for (i = 0; i < MAX_MB_PLANE; i++)
       backup_yv12[i] = xd->plane[i].pre[0];
 
-    setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
-  vp9_clamp_mv_min_max(x, &ref_mv.as_mv);
-
-  // Adjust search parameters based on small partitions' result.
-  if (x->fast_ms) {
-    // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 &&
-    // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) {
-    // adjust search range
-    step_param = 6;
-    if (x->fast_ms > 1)
-      step_param = 8;
-
-    // Get prediction MV.
-    mvp_full.as_int = x->pred_mv[ref].as_int;
+  vp9_set_mv_search_range(x, &ref_mv);
 
-    // Adjust MV sign if needed.
-    if (cm->ref_frame_sign_bias[ref]) {
-      mvp_full.as_mv.col *= -1;
-      mvp_full.as_mv.row *= -1;
-    }
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+  if (cpi->sf.auto_mv_step_size && cm->show_frame) {
+    // Take weighted average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param = (vp9_init_search_range(&cpi->sf, x->max_mv_context[ref]) +
+                    cpi->mv_step_param) / 2;
   } else {
-    // Work out the size of the first step in the mv step search.
-    // 0 here is maximum length first step. 1 is MAX >> 1 etc.
-    if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
-      // Take wtd average of the step_params based on the last frame's
-      // max mv magnitude and that based on the best ref mvs of the current
-      // block for the given reference.
-      step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
-                    cpi->mv_step_param) >> 1;
-    } else {
-      step_param = cpi->mv_step_param;
-    }
+    step_param = cpi->mv_step_param;
   }
 
   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
-      cpi->common.show_frame) {
+      cm->show_frame) {
     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
                                                        b_width_log2(bsize)));
     step_param = MAX(step_param, boffset);
   }
 
-  mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ?
-      mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int :
-      x->pred_mv[ref].as_int;
-
-  mvp_full.as_mv.col >>= 3;
-  mvp_full.as_mv.row >>= 3;
-
-  // Further step/diamond searches as necessary
-  further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
-  if (cpi->sf.search_method == HEX) {
-    bestsme = vp9_hex_search(x, &mvp_full.as_mv,
-                             step_param,
-                             sadpb, 1,
-                             &cpi->fn_ptr[block_size], 1,
-                             &ref_mv.as_mv, &tmp_mv->as_mv);
-  } else if (cpi->sf.search_method == SQUARE) {
-    bestsme = vp9_square_search(x, &mvp_full.as_mv,
-                                step_param,
-                                sadpb, 1,
-                                &cpi->fn_ptr[block_size], 1,
-                                &ref_mv.as_mv, &tmp_mv->as_mv);
-  } else if (cpi->sf.search_method == BIGDIA) {
-    bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
-                                step_param,
-                                sadpb, 1,
-                                &cpi->fn_ptr[block_size], 1,
-                                &ref_mv.as_mv, &tmp_mv->as_mv);
-  } else {
-    bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                     sadpb, further_steps, 1,
-                                     &cpi->fn_ptr[block_size],
-                                     &ref_mv, tmp_mv);
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int i;
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5)
+      step_param += 2;
+
+    for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+      if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+        x->pred_mv[ref].as_int = 0;
+        tmp_mv->as_int = INVALID_MV;
+
+        if (scaled_ref_frame) {
+          int i;
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
+        }
+        return;
+      }
+    }
   }
 
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                              &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
   x->mv_row_min = tmp_row_min;
@@ -2445,19 +2405,19 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-    unsigned int sse;
-    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
-                                 &cpi->fn_ptr[block_size],
-                                 0, cpi->sf.subpel_iters_per_step,
+                                 &cpi->fn_ptr[bsize],
+                                 cpi->sf.subpel_force_stop,
+                                 cpi->sf.subpel_iters_per_step,
                                  x->nmvjointcost, x->mvcost,
-                                 &dis, &sse);
+                                 &dis, &x->pred_sse[ref]);
   }
-  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
-  if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
+  if (cpi->sf.adaptive_motion_search && cm->show_frame)
     x->pred_mv[ref].as_int = tmp_mv->as_int;
 
   if (scaled_ref_frame) {
@@ -2473,64 +2433,51 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
                                 int *rate_mv) {
-  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  int refs[2] = { mbmi->ref_frame[0],
-    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int refs[2] = { mbmi->ref_frame[0],
+                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int_mv ref_mv[2];
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  int ite;
+  int ite, ref;
   // Prediction buffer from second frame.
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+  const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
 
   // Do joint motion search in compound mode to get more accurate mv.
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
-  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
-  struct buf_2d scaled_first_yv12;
+  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+  struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
   int last_besterr[2] = {INT_MAX, INT_MAX};
-  YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
-  scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
-  scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]);
-
-  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
-  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
-
-  if (scaled_ref_frame[0]) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[0];
-    setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
-  }
+  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+  };
 
-  if (scaled_ref_frame[1]) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_second_yv12[i] = xd->plane[i].pre[1];
+  for (ref = 0; ref < 2; ++ref) {
+    ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
+
+    if (scaled_ref_frame[ref]) {
+      int i;
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        backup_yv12[ref][i] = xd->plane[i].pre[ref];
+      vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                           NULL);
+    }
 
-    setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
+    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
   }
 
-  xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
-                                         mi_row, mi_col);
-  xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
-                                         mi_row, mi_col);
-  scaled_first_yv12 = xd->plane[0].pre[0];
-
-  // Initialize mv using single prediction mode result.
-  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
   // Allow joint search multiple times iteratively for each ref frame
   // and break out the search loop if it couldn't find better mv.
   for (ite = 0; ite < 4; ite++) {
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
     int sadpb = x->sadperbit16;
-    int_mv tmp_mv;
+    MV tmp_mv;
     int search_range = 3;
 
     int tmp_col_min = x->mv_col_min;
@@ -2548,28 +2495,30 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                               ref_yv12[!id].stride,
                               second_pred, pw,
                               &frame_mv[refs[!id]].as_mv,
-                              &xd->scale_factor[!id],
+                              &xd->block_refs[!id]->sf,
                               pw, ph, 0,
-                              &xd->subpix, MV_PRECISION_Q3);
+                              kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
 
     // Compound motion search on first ref frame.
     if (id)
       xd->plane[0].pre[0] = ref_yv12[id];
-    vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv);
+    vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
 
     // Use mv result from single mode as mvp.
-    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+    tmp_mv = frame_mv[refs[id]].as_mv;
 
-    tmp_mv.as_mv.col >>= 3;
-    tmp_mv.as_mv.row >>= 3;
+    tmp_mv.col >>= 3;
+    tmp_mv.row >>= 3;
 
     // Small-range full-pixel motion search
     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
                                        search_range,
-                                       &cpi->fn_ptr[block_size],
-                                       x->nmvjointcost, x->mvcost,
-                                       &ref_mv[id], second_pred,
-                                       pw, ph);
+                                       &cpi->fn_ptr[bsize],
+                                       &ref_mv[id].as_mv, second_pred);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+                                      second_pred, &cpi->fn_ptr[bsize], 1);
 
     x->mv_col_min = tmp_col_min;
     x->mv_col_max = tmp_col_max;
@@ -2579,13 +2528,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
-
       bestsme = cpi->find_fractional_mv_step_comp(
-          x, &tmp_mv.as_mv,
+          x, &tmp_mv,
           &ref_mv[id].as_mv,
           cpi->common.allow_high_precision_mv,
           x->errorperbit,
-          &cpi->fn_ptr[block_size],
+          &cpi->fn_ptr[bsize],
           0, cpi->sf.subpel_iters_per_step,
           x->nmvjointcost, x->mvcost,
           &dis, &sse, second_pred,
@@ -2596,37 +2544,42 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
       xd->plane[0].pre[0] = scaled_first_yv12;
 
     if (bestsme < last_besterr[id]) {
-      frame_mv[refs[id]].as_int = tmp_mv.as_int;
+      frame_mv[refs[id]].as_mv = tmp_mv;
       last_besterr[id] = bestsme;
     } else {
       break;
     }
   }
 
-  // restore the predictor
-  if (scaled_ref_frame[0]) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[0] = backup_yv12[i];
-  }
+  *rate_mv = 0;
 
-  if (scaled_ref_frame[1]) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[1] = backup_second_yv12[i];
+  for (ref = 0; ref < 2; ++ref) {
+    if (scaled_ref_frame[ref]) {
+      // restore the predictor
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[ref] = backup_yv12[ref][i];
+    }
+
+    *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+                                &mbmi->ref_mvs[refs[ref]][0].as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
-  *rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                              &mbmi->ref_mvs[refs[0]][0].as_mv,
-                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-  *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                              &mbmi->ref_mvs[refs[1]][0].as_mv,
-                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   vpx_free(second_pred);
 }
 
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+                                   uint8_t *orig_dst[MAX_MB_PLANE],
+                                   int orig_dst_stride[MAX_MB_PLANE]) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = orig_dst[i];
+    xd->plane[i].dst.stride = orig_dst_stride[i];
+  }
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 const TileInfo *const tile,
                                  BLOCK_SIZE bsize,
                                  int64_t txfm_cache[],
                                  int *rate2, int64_t *distortion,
@@ -2634,15 +2587,16 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  int *rate_y, int64_t *distortion_y,
                                  int *rate_uv, int64_t *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
-                                 INTERPOLATION_TYPE *best_filter,
+                                 INTERP_FILTER *best_filter,
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
                                  int_mv single_newmv[MAX_REF_FRAMES],
                                  int64_t *psse,
                                  const int64_t ref_best_rd) {
   VP9_COMMON *cm = &cpi->common;
+  RD_OPT *rd_opt = &cpi->rd;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int is_comp_pred = has_second_ref(mbmi);
   const int num_refs = is_comp_pred ? 2 : 1;
   const int this_mode = mbmi->mode;
@@ -2661,6 +2615,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
 
+  if (is_comp_pred) {
+    if (frame_mv[refs[0]].as_int == INVALID_MV ||
+        frame_mv[refs[1]].as_int == INVALID_MV)
+      return INT64_MAX;
+  }
+
   if (this_mode == NEWMV) {
     int rate_mv;
     if (is_comp_pred) {
@@ -2679,64 +2639,27 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
-      if (frame_mv[refs[0]].as_int == INVALID_MV ||
-          frame_mv[refs[1]].as_int == INVALID_MV)
-        return INT64_MAX;
       *rate2 += rate_mv;
     } else {
       int_mv tmp_mv;
-      single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+      single_motion_search(cpi, x, bsize, mi_row, mi_col,
                            &tmp_mv, &rate_mv);
+      if (tmp_mv.as_int == INVALID_MV)
+        return INT64_MAX;
       *rate2 += rate_mv;
       frame_mv[refs[0]].as_int =
-          xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
       single_newmv[refs[0]].as_int = tmp_mv.as_int;
     }
   }
 
-  // if we're near/nearest and mv == 0,0, compare to zeromv
-  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
-      frame_mv[refs[0]].as_int == 0 &&
-      !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
-      (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
-    int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
-    int c1 = cost_mv_ref(cpi, NEARMV, rfc);
-    int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
-    int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
-
-    if (this_mode == NEARMV) {
-      if (c1 > c3)
-        return INT64_MAX;
-    } else if (this_mode == NEARESTMV) {
-      if (c2 > c3)
-        return INT64_MAX;
-    } else {
-      assert(this_mode == ZEROMV);
-      if (num_refs == 1) {
-        if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
-            (c3 >= c1 &&
-             mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
-          return INT64_MAX;
-      } else {
-        if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
-            (c3 >= c1 &&
-             mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
-             mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
-          return INT64_MAX;
-      }
-    }
-  }
-
   for (i = 0; i < num_refs; ++i) {
     cur_mv[i] = frame_mv[refs[i]];
     // Clip "next_nearest" so that it does not extend to far out of image
     if (this_mode != NEWMV)
       clamp_mv2(&cur_mv[i].as_mv, xd);
 
-    if (mv_check_bounds(x, &cur_mv[i]))
+    if (mv_check_bounds(x, &cur_mv[i].as_mv))
       return INT64_MAX;
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
@@ -2755,67 +2678,57 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
    * words if you present them in that order, the second one is always known
    * if the first is known */
-  *rate2 += cost_mv_ref(cpi, this_mode,
-                        mbmi->mode_context[mbmi->ref_frame[0]]);
+  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
 
-  if (!(*mode_excluded)) {
-    if (is_comp_pred) {
-      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
-    } else {
-      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
-    }
-  }
+  if (!(*mode_excluded))
+    *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+                                  : cm->reference_mode == COMPOUND_REFERENCE;
 
   pred_exists = 0;
   // Are all MVs integer pel for Y and UV
-  intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
-      (mbmi->mv[0].as_mv.col & 15) == 0;
+  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
   if (is_comp_pred)
-    intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
-        (mbmi->mv[1].as_mv.col & 15) == 0;
+    intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
+
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (cm->mcomp_filter_type != BILINEAR) {
+  rd_opt->mask_filter = 0;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    rd_opt->filter_cache[i] = INT64_MAX;
+
+  if (cm->interp_filter != BILINEAR) {
     *best_filter = EIGHTTAP;
-    if (x->source_variance <
-        cpi->sf.disable_filter_search_var_thresh) {
+    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
       *best_filter = EIGHTTAP;
-      vp9_zero(cpi->rd_filter_cache);
     } else {
-      int i, newbest;
+      int newbest;
       int tmp_rate_sum = 0;
       int64_t tmp_dist_sum = 0;
 
-      cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         int j;
         int64_t rs_rd;
         mbmi->interp_filter = i;
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-        rs = get_switchable_rate(x);
+        rs = vp9_get_switchable_rate(cpi);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
         if (i > 0 && intpel_mv) {
-          cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
-                                           tmp_rate_sum, tmp_dist_sum);
-          cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
-              MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
-                  cpi->rd_filter_cache[i] + rs_rd);
-          rd = cpi->rd_filter_cache[i];
-          if (cm->mcomp_filter_type == SWITCHABLE)
+          rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
+          rd_opt->filter_cache[i] = rd;
+          rd_opt->filter_cache[SWITCHABLE_FILTERS] =
+              MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          if (cm->interp_filter == SWITCHABLE)
             rd += rs_rd;
+          rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
         } else {
           int rate_sum = 0;
           int64_t dist_sum = 0;
-          if ((cm->mcomp_filter_type == SWITCHABLE &&
+          if ((cm->interp_filter == SWITCHABLE &&
                (!i || best_needs_copy)) ||
-              (cm->mcomp_filter_type != SWITCHABLE &&
-               (cm->mcomp_filter_type == mbmi->interp_filter ||
+              (cm->interp_filter != SWITCHABLE &&
+               (cm->interp_filter == mbmi->interp_filter ||
                 (i == 0 && intpel_mv)))) {
-            for (j = 0; j < MAX_MB_PLANE; j++) {
-              xd->plane[j].dst.buf = orig_dst[j];
-              xd->plane[j].dst.stride = orig_dst_stride[j];
-            }
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
           } else {
             for (j = 0; j < MAX_MB_PLANE; j++) {
               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
@@ -2824,25 +2737,24 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           }
           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
-          cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
-                                           rate_sum, dist_sum);
-          cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
-              MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
-                  cpi->rd_filter_cache[i] + rs_rd);
-          rd = cpi->rd_filter_cache[i];
-          if (cm->mcomp_filter_type == SWITCHABLE)
+
+          rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
+          rd_opt->filter_cache[i] = rd;
+          rd_opt->filter_cache[SWITCHABLE_FILTERS] =
+              MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+          if (cm->interp_filter == SWITCHABLE)
             rd += rs_rd;
+          rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+
           if (i == 0 && intpel_mv) {
             tmp_rate_sum = rate_sum;
             tmp_dist_sum = dist_sum;
           }
         }
+
         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
           if (rd / 2 > ref_best_rd) {
-            for (i = 0; i < MAX_MB_PLANE; i++) {
-              xd->plane[i].dst.buf = orig_dst[i];
-              xd->plane[i].dst.stride = orig_dst_stride[i];
-            }
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
             return INT64_MAX;
           }
         }
@@ -2851,28 +2763,23 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         if (newbest) {
           best_rd = rd;
           *best_filter = mbmi->interp_filter;
-          if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv)
+          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
             best_needs_copy = !best_needs_copy;
         }
 
-        if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
-            (cm->mcomp_filter_type != SWITCHABLE &&
-             cm->mcomp_filter_type == mbmi->interp_filter)) {
+        if ((cm->interp_filter == SWITCHABLE && newbest) ||
+            (cm->interp_filter != SWITCHABLE &&
+             cm->interp_filter == mbmi->interp_filter)) {
           pred_exists = 1;
         }
       }
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
     }
   }
   // Set the appropriate filter
-  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
-      cm->mcomp_filter_type : *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-  rs = cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(x) : 0;
+  mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : *best_filter;
+  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
 
   if (pred_exists) {
     if (best_needs_copy) {
@@ -2888,7 +2795,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
-
   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     int tmp_rate;
     int64_t tmp_dist;
@@ -2897,44 +2803,37 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     // if current pred_error modeled rd is substantially more than the best
     // so far, do not bother doing full rd
     if (rd / 2 > ref_best_rd) {
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
       return INT64_MAX;
     }
   }
 
-  if (cpi->common.mcomp_filter_type == SWITCHABLE)
-    *rate2 += get_switchable_rate(x);
+  if (cm->interp_filter == SWITCHABLE)
+    *rate2 += vp9_get_switchable_rate(cpi);
 
-  if (!is_comp_pred && cpi->enable_encode_breakout) {
-    if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+  if (!is_comp_pred) {
+    if (!x->in_active_map) {
+      if (psse)
+        *psse = 0;
+      *distortion = 0;
       x->skip = 1;
-    else if (x->encode_breakout) {
+    } else if (cpi->allow_encode_breakout && x->encode_breakout) {
       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
       unsigned int var, sse;
       // Skipping threshold for ac.
       unsigned int thresh_ac;
-      // The encode_breakout input
-      unsigned int encode_breakout = x->encode_breakout << 4;
-      unsigned int max_thresh = 36000;
-
+      // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
       // Use extreme low threshold for static frames to limit skipping.
-      if (cpi->enable_encode_breakout == 2)
-        max_thresh = 128;
+      const unsigned int max_thresh = (cpi->allow_encode_breakout ==
+                                      ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
+      // The encode_breakout input
+      const unsigned int min_thresh =
+          MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
 
       // Calculate threshold according to dequant value.
       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
-
-      // Use encode_breakout input if it is bigger than internal threshold.
-      if (thresh_ac < encode_breakout)
-        thresh_ac = encode_breakout;
-
-      // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
-      if (thresh_ac > max_thresh)
-        thresh_ac = max_thresh;
+      thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
                                    xd->plane[0].dst.buf,
@@ -2975,7 +2874,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
               x->skip = 1;
 
               // The cost of skip bit needs to be added.
-              *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
+              *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
 
               // Scaling factor for SSE from spatial domain to frequency domain
               // is 16. Adjust distortion accordingly.
@@ -2997,16 +2896,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int64_t rdcosty = INT64_MAX;
 
     // Y cost and distortion
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
-                    bsize, txfm_cache, ref_best_rd);
+    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+                          bsize, txfm_cache, ref_best_rd);
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
       return INT64_MAX;
     }
 
@@ -3021,10 +2917,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (*rate_uv == INT_MAX) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
       return INT64_MAX;
     }
 
@@ -3034,14 +2927,34 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     *skippable = skippable_y && skippable_uv;
   }
 
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = orig_dst[i];
-    xd->plane[i].dst.stride = orig_dst_stride[i];
-  }
-
+  restore_dst_buf(xd, orig_dst, orig_dst_stride);
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                           int max_plane) {
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = x->e_mbd.plane;
+  int i;
+
+  for (i = 0; i < max_plane; ++i) {
+    p[i].coeff    = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs    = ctx->eobs_pbuf[i][1];
+
+    ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
+    ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
+    ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
+    ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
+
+    ctx->coeff_pbuf[i][0]   = p[i].coeff;
+    ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
+    ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+    ctx->eobs_pbuf[i][0]    = p[i].eobs;
+  }
+}
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int64_t *returndist,
                                BLOCK_SIZE bsize,
@@ -3051,9 +2964,11 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   int y_skip = 0, uv_skip = 0;
   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
+  TX_SIZE max_uv_tx_size;
   x->skip_encode = 0;
   ctx->skip = 0;
-  xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+
   if (bsize >= BLOCK_8X8) {
     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, bsize, tx_cache,
@@ -3061,8 +2976,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       *returnrate = INT_MAX;
       return;
     }
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip, bsize);
+    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
+    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+                            &dist_uv, &uv_skip, bsize, max_uv_tx_size);
   } else {
     y_skip = 0;
     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -3070,19 +2986,19 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       *returnrate = INT_MAX;
       return;
     }
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip, BLOCK_8X8);
+    max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
+    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+                            &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
   }
 
   if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                  vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
+                  vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
     *returndist = dist_y + dist_uv;
     vp9_zero(ctx->tx_rd_diff);
   } else {
     int i;
-    *returnrate = rate_y + rate_uv +
-        vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
+    *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
     *returndist = dist_y + dist_uv;
     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
       for (i = 0; i < TX_MODES; i++) {
@@ -3093,7 +3009,35 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
   }
 
-  ctx->mic = *xd->mi_8x8[0];
+  ctx->mic = *xd->mi[0];
+}
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+    return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
+// Updating rd_thresh_freq_fact[] here means that the different
+// partition/block sizes are handled independently based on the best
+// choice for the current partition. It may well be better to keep a scaled
+// best rd so far value and update rd_thresh_freq_fact based on the mode/size
+// combination that wins out.
+static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
+                                  int best_mode_index) {
+  if (cpi->sf.adaptive_rd_thresh > 0) {
+    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+    int mode;
+    for (mode = 0; mode < top_mode; ++mode) {
+      int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
+
+      if (mode == best_mode_index) {
+        *fact -= (*fact >> 3);
+      } else {
+        *fact = MIN(*fact + RD_THRESH_INC,
+                    cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+      }
+    }
+  }
 }
 
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
@@ -3104,12 +3048,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize,
                                   PICK_MODE_CONTEXT *ctx,
                                   int64_t best_rd_so_far) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const struct segmentation *seg = &cm->seg;
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  MB_PREDICTION_MODE this_mode;
+  VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct segmentation *const seg = &cm->seg;
+  PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i;
@@ -3118,51 +3062,46 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
-  int idx_list[4] = {0,
-                     cpi->lst_fb_idx,
-                     cpi->gld_fb_idx,
-                     cpi->alt_fb_idx};
   int64_t best_rd = best_rd_so_far;
   int64_t best_tx_rd[TX_MODES];
   int64_t best_tx_diff[TX_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  MB_MODE_INFO best_mbmode = { 0 };
-  int j;
-  int mode_index, best_mode_index = 0;
+  MB_MODE_INFO best_mbmode;
+  int mode_index, best_mode_index = -1;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vp9_prob comp_mode_p;
   int64_t best_intra_rd = INT64_MAX;
   int64_t best_inter_rd = INT64_MAX;
-  MB_PREDICTION_MODE best_intra_mode = DC_PRED;
+  PREDICTION_MODE best_intra_mode = DC_PRED;
   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
-  INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
+  INTERP_FILTER tmp_best_filter = SWITCHABLE;
   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
-  MB_PREDICTION_MODE mode_uv[TX_SIZES];
-  struct scale_factors scale_factor[4];
-  unsigned int ref_frame_mask = 0;
-  unsigned int mode_mask = 0;
+  PREDICTION_MODE mode_uv[TX_SIZES];
   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
-  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
-
-  x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
-
-  // Everywhere the flag is set the error is much higher than its neighbors.
-  ctx->frames_with_high_error = 0;
-  ctx->modes_with_high_error = 0;
-
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+  int mode_skip_mask = 0;
+  int mode_skip_start = cpi->sf.mode_skip_start + 1;
+  const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
+  const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
+  const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
+  const int intra_y_mode_mask =
+      cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
+  int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+  vp9_zero(best_mbmode);
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+  for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
   for (i = 0; i < TX_MODES; i++)
     best_tx_rd[i] = INT64_MAX;
@@ -3170,51 +3109,105 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
+  for (i = 0; i < MAX_REF_FRAMES; ++i)
+    x->pred_sse[i] = INT_MAX;
 
   *returnrate = INT_MAX;
 
-  // Create a mask set to 1 for each reference frame used by a smaller
-  // resolution.
-  if (cpi->sf.use_avoid_tested_higherror) {
-    switch (block_size) {
-      case BLOCK_64X64:
-        for (i = 0; i < 4; i++) {
-          for (j = 0; j < 4; j++) {
-            ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
-            mode_mask |= x->mb_context[i][j].modes_with_high_error;
-          }
-        }
-        for (i = 0; i < 4; i++) {
-          ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
-          mode_mask |= x->sb32_context[i].modes_with_high_error;
-        }
-        break;
-      case BLOCK_32X32:
-        for (i = 0; i < 4; i++) {
-          ref_frame_mask |=
-              x->mb_context[xd->sb_index][i].frames_with_high_error;
-          mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    x->pred_mv_sad[ref_frame] = INT_MAX;
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      vp9_setup_buffer_inter(cpi, x, tile,
+                             ref_frame, bsize, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // All modes from vp9_mode_order that use this frame as any ref
+    static const int ref_frame_mask_all[] = {
+        0x0, 0x123291, 0x25c444, 0x39b722
+    };
+    // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
+    // this frame as their primary ref
+    static const int ref_frame_mask_fixedmv[] = {
+        0x0, 0x121281, 0x24c404, 0x080102
+    };
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+      // Skip modes for missing references
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+    } else if (cpi->sf.reference_masking) {
+      for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+        // Skip fixed mv modes for poor references
+        if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
+          break;
         }
-        break;
-      default:
-        // Until we handle all block sizes set it to present;
-        ref_frame_mask = 0;
-        mode_mask = 0;
-        break;
+      }
+    }
+    // If the segment reference frame feature is enabled, skip every mode that
+    // uses this frame unless it is the reference the segment specifies.
+    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
     }
-    ref_frame_mask = ~ref_frame_mask;
-    mode_mask = ~mode_mask;
   }
 
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
-                         block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         yv12_mb, scale_factor);
+  // If the segment skip feature is enabled, exclude every mode covered by
+  // inter_non_zero_mode_mask from the search.
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    const int inter_non_zero_mode_mask = 0x1F7F7;
+    mode_skip_mask |= inter_non_zero_mode_mask;
+  }
+
+  // Disable this drop out case if the ref frame
+  // segment level feature is enabled for this segment. This is to
+  // prevent the possibility that we end up unable to pick any mode.
+  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      mode_skip_mask =
+          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARA);
+      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARESTA);
     }
-    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
-    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  // TODO(JBB): This is to make up for the fact that we don't have sad
+  // functions that work when the block size reads outside the umv.  We
+  // should fix this either by making the motion search just work on
+  // a representative block in the boundary ( first ) and then implement a
+  // function that does sads when inside the border..
+  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
+    const int new_modes_mask =
+        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
+        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
+    mode_skip_mask |= new_modes_mask;
+  }
+
+  if (bsize > cpi->sf.max_intra_bsize) {
+    mode_skip_mask |= 0xFF30808;
+  }
+
+  if (!x->in_active_map) {
+    int mode_index;
+    assert(cpi->ref_frame_flags & VP9_LAST_FLAG);
+    if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0)
+      mode_index = THR_NEARESTMV;
+    else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0)
+      mode_index = THR_NEARMV;
+    else
+      mode_index = THR_ZEROMV;
+    mode_skip_mask = ~(1 << mode_index);
+    mode_skip_start = MAX_MODES;
+    disable_inter_mode_mask = 0;
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3228,125 +3221,104 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int64_t tx_cache[TX_MODES];
     int i;
     int this_skip2 = 0;
-    int64_t total_sse = INT_MAX;
+    int64_t total_sse = INT64_MAX;
     int early_term = 0;
 
-    for (i = 0; i < TX_MODES; ++i)
-      tx_cache[i] = INT64_MAX;
-
-    x->skip = 0;
-    this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame;
-    second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
-    if (mode_index > cpi->sf.mode_skip_start) {
-      if (mode_index == (cpi->sf.mode_skip_start + 1)) {
-        switch (vp9_mode_order[best_mode_index].ref_frame) {
-          case INTRA_FRAME:
-            cpi->mode_skip_mask = 0;
-            break;
-          case LAST_FRAME:
-            cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
-            break;
-          case GOLDEN_FRAME:
-            cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
-            break;
-          case ALTREF_FRAME:
-            cpi->mode_skip_mask = ALT_REF_MODE_MASK;
-            break;
-          case NONE:
-          case MAX_REF_FRAMES:
-            assert(!"Invalid Reference frame");
-        }
+    if (mode_index == mode_skip_start && best_mode_index >= 0) {
+      switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
+        case INTRA_FRAME:
+          break;
+        case LAST_FRAME:
+          mode_skip_mask |= LAST_FRAME_MODE_MASK;
+          break;
+        case GOLDEN_FRAME:
+          mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
+          break;
+        case ALTREF_FRAME:
+          mode_skip_mask |= ALT_REF_MODE_MASK;
+          break;
+        case NONE:
+        case MAX_REF_FRAMES:
+          assert(0 && "Invalid Reference frame");
       }
-      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
-        continue;
     }
-
-    // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
+    if (mode_skip_mask & (1 << mode_index))
       continue;
 
     // Test best rd so far against threshold for trying this mode.
-    if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] *
-                     cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) ||
-        cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX)
+    if (rd_less_than_thresh(best_rd, rd_threshes[mode_index],
+        rd_thresh_freq_fact[mode_index]))
       continue;
 
-    // Do not allow compound prediction if the segment level reference
-    // frame feature is in use as in this case there can only be one reference.
-    if ((second_ref_frame > INTRA_FRAME) &&
-         vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-      continue;
-
-    // Skip some checking based on small partitions' result.
-    if (x->fast_ms > 1 && !ref_frame)
-      continue;
-    if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    if (ref_frame != INTRA_FRAME &&
+        disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
       continue;
+    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
-    if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) {
-      if (!(ref_frame_mask & (1 << ref_frame))) {
-        continue;
-      }
-      if (!(mode_mask & (1 << this_mode))) {
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (comp_pred) {
+      if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          best_mode_index >=0 &&
+          vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
         continue;
-      }
-      if (second_ref_frame != NONE
-          && !(ref_frame_mask & (1 << second_ref_frame))) {
+      if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
+          ref_frame != best_inter_ref_frame &&
+          second_ref_frame != best_inter_ref_frame)
         continue;
-      }
-    }
-
-    mbmi->ref_frame[0] = ref_frame;
-    mbmi->ref_frame[1] = second_ref_frame;
-
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-    if (!(second_ref_frame == NONE
-        || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
-      continue;
+      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+    } else {
+      if (ref_frame != INTRA_FRAME)
+        mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
     }
 
-    comp_pred = second_ref_frame > INTRA_FRAME;
-    if (comp_pred) {
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
-        if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
+    if (ref_frame == INTRA_FRAME) {
+      if (!(intra_y_mode_mask & (1 << this_mode)))
+        continue;
+      if (this_mode != DC_PRED) {
+        // Disable intra modes other than DC_PRED for blocks with low variance
+        // Threshold for intra skipping based on source variance
+        // TODO(debargha): Specialize the threshold for super block sizes
+        const unsigned int skip_intra_var_thresh = 64;
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+            x->source_variance < skip_intra_var_thresh)
           continue;
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
-        if (ref_frame != best_inter_ref_frame &&
-            second_ref_frame != best_inter_ref_frame)
+        // Only search the oblique modes if the best so far is
+        // one of the neighboring directional modes
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+          if (best_mode_index >= 0 &&
+              vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
+            continue;
+        }
+        if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, best_intra_mode))
+              continue;
+        }
+      }
+    } else {
+      if (x->in_active_map &&
+          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+        const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
+        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
+                                disable_inter_mode_mask, this_mode, ref_frames))
           continue;
+      }
     }
 
-    set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
-    mbmi->uv_mode = DC_PRED;
-
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = x->in_active_map ? DC_PRED : this_mode;
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = second_ref_frame;
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->mcomp_filter_type;
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
-    if (comp_pred) {
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
-        continue;
-      set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
-
-      mode_excluded = mode_excluded
-                         ? mode_excluded
-                         : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
-    } else {
-      if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
-        mode_excluded =
-            mode_excluded ?
-                mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-    }
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
     // Select prediction reference frames.
     for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -3355,91 +3327,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
-      continue;
-    // Disable this drop out case if the ref frame
-    // segment level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative. We allow near/nearest as well
-      // because they may result in zero-zero MVs but be cheaper.
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if ((this_mode != ZEROMV &&
-             !(this_mode == NEARMV &&
-               frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
-             !(this_mode == NEARESTMV &&
-               frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
-            ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-    // TODO(JBB): This is to make up for the fact that we don't have sad
-    // functions that work when the block size reads outside the umv.  We
-    // should fix this either by making the motion search just work on
-    // a representative block in the boundary ( first ) and then implement a
-    // function that does sads when inside the border..
-    if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
-        this_mode == NEWMV) {
-      continue;
-    }
-
-#ifdef MODE_TEST_HIT_STATS
-    // TEST/DEBUG CODE
-    // Keep a rcord of the number of test hits at each size
-    cpi->mode_test_hits[bsize]++;
-#endif
-
+    for (i = 0; i < TX_MODES; ++i)
+      tx_cache[i] = INT64_MAX;
 
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      // Disable intra modes other than DC_PRED for blocks with low variance
-      // Threshold for intra skipping based on source variance
-      // TODO(debargha): Specialize the threshold for super block sizes
-      static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
-        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-      };
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
-          this_mode != DC_PRED &&
-          x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
-        continue;
-      // Only search the oblique modes if the best so far is
-      // one of the neighboring directional modes
-      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
-        if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
-          continue;
-      }
-      mbmi->mode = this_mode;
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-        if (conditional_skipintra(mbmi->mode, best_intra_mode))
-            continue;
-      }
-
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
-                      bsize, tx_cache, best_rd);
+      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
+                            bsize, tx_cache, best_rd);
 
       if (rate_y == INT_MAX)
         continue;
 
-      uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
+      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
-        choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
-                             &rate_uv_tokenonly[uv_tx],
-                             &dist_uv[uv_tx], &skip_uv[uv_tx],
-                             &mode_uv[uv_tx]);
+        choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+                             &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                             &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
       }
 
       rate_uv = rate_uv_tokenonly[uv_tx];
@@ -3447,14 +3350,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
 
-      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
     } else {
-      mbmi->mode = this_mode;
-      compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
-      this_rd = handle_inter_mode(cpi, x, tile, bsize,
+      this_rd = handle_inter_mode(cpi, x, bsize,
                                   tx_cache,
                                   &rate2, &distortion2, &skippable,
                                   &rate_y, &distortion_y,
@@ -3465,15 +3366,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   single_newmv, &total_sse, best_rd);
       if (this_rd == INT64_MAX)
         continue;
-    }
 
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      rate2 += compmode_cost;
+      compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
+      if (cm->reference_mode == REFERENCE_MODE_SELECT)
+        rate2 += compmode_cost;
     }
 
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
-    if (second_ref_frame > INTRA_FRAME) {
+    if (comp_pred) {
       rate2 += ref_costs_comp[ref_frame];
     } else {
       rate2 += ref_costs_single[ref_frame];
@@ -3498,9 +3400,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           int prob_skip_cost;
 
           // Cost the skip mb case
-          vp9_prob skip_prob =
-            vp9_get_pred_prob_mbskip(cm, xd);
-
+          vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
           if (skip_prob) {
             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
             rate2 += prob_skip_cost;
@@ -3510,14 +3410,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
-                                            0);
-          rate2 += prob_skip_cost;
+          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
         } else {
           // FIXME(rbultje) make this work for splitmv also
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
-                                            1);
-          rate2 += prob_skip_cost;
+          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
           distortion2 = total_sse;
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
@@ -3527,33 +3423,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         }
       } else if (mb_skip_allowed) {
         // Add in the cost of the no skip flag.
-        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
-                                          0);
-        rate2 += prob_skip_cost;
+        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
       }
 
       // Calculate the final RD estimate for this mode.
       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
     }
 
+    if (ref_frame == INTRA_FRAME) {
     // Keep record of best intra rd
-    if (xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME &&
-        is_intra_mode(xd->mi_8x8[0]->mbmi.mode) &&
-        this_rd < best_intra_rd) {
-      best_intra_rd = this_rd;
-      best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
-    }
-    // Keep record of best inter rd with single reference
-    if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
-        xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
-        !mode_excluded &&
-        this_rd < best_inter_rd) {
-      best_inter_rd = this_rd;
-      best_inter_ref_frame = ref_frame;
+      if (this_rd < best_intra_rd) {
+        best_intra_rd = this_rd;
+        best_intra_mode = mbmi->mode;
+      }
+    } else {
+      // Keep record of best inter rd with single reference
+      if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
+        best_inter_rd = this_rd;
+        best_inter_ref_frame = ref_frame;
+      }
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+      for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -3564,13 +3456,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         || distortion2 < mode_distortions[this_mode]) {
       mode_distortions[this_mode] = distortion2;
     }
-    if (frame_distortions[ref_frame] == -1
-        || distortion2 < frame_distortions[ref_frame]) {
-      frame_distortions[ref_frame] = distortion2;
-    }
 
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
+      int max_plane = MAX_MB_PLANE;
       if (!mode_excluded) {
         // Note index of best mode so far
         best_mode_index = mode_index;
@@ -3578,6 +3467,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
+          max_plane = 1;
         }
 
         *returnrate = rate2;
@@ -3585,12 +3475,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         best_rd = this_rd;
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
+        if (!x->select_txfm_size)
+          swap_block_ptr(x, ctx, max_plane);
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(uint8_t) * ctx->num_4x4_blk);
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
-        if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+        if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
             (mode_index > MIN_EARLY_TERM_INDEX)) {
           const int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
@@ -3609,9 +3501,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
 
-      if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
         single_rate = rate2 - compmode_cost;
         hybrid_rate = rate2;
       } else {
@@ -3622,40 +3514,39 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+      if (!comp_pred) {
+        if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+          best_pred_rd[SINGLE_REFERENCE] = single_rd;
+        }
+      } else {
+        if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+          best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+        }
       }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-    }
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+
+      /* keep record of best filter type */
+      if (!mode_excluded && cm->interp_filter != BILINEAR) {
+        int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->interp_filter];
+
+        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+          int64_t adj_rd;
+          if (ref == INT64_MAX)
+            adj_rd = 0;
+          else if (rd_opt->filter_cache[i] == INT64_MAX)
+            // when early termination is triggered, the encoder does not have
+            // access to the rate-distortion cost. it only knows that the cost
+            // should be above the maximum valid value. hence it takes the known
+            // maximum plus an arbitrary constant as the rate-distortion cost.
+            adj_rd = rd_opt->mask_filter - ref + 10;
+          else
+            adj_rd = rd_opt->filter_cache[i] - ref;
 
-    /* keep record of best filter type */
-    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
-        cm->mcomp_filter_type != BILINEAR) {
-      int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->mcomp_filter_type];
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-        int64_t adj_rd;
-        // In cases of poor prediction, filter_cache[] can contain really big
-        // values, which actually are bigger than this_rd itself. This can
-        // cause negative best_filter_rd[] values, which is obviously silly.
-        // Therefore, if filter_cache < ref, we do an adjusted calculation.
-        if (cpi->rd_filter_cache[i] >= ref) {
-          adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
-        } else {
-          // FIXME(rbultje) do this for comppsred also
-          //
-          // To prevent out-of-range computation in
-          //    adj_rd = cpi->rd_filter_cache[i] * this_rd / ref
-          // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio.
-          int tmp = cpi->rd_filter_cache[i] * 256 / ref;
-          adj_rd = (this_rd * tmp) >> 8;
+          adj_rd += this_rd;
+          best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
         }
-        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
       }
     }
 
@@ -3683,76 +3574,36 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       break;
   }
 
-  if (best_rd >= best_rd_so_far)
+  if (best_mode_index < 0 || best_rd >= best_rd_so_far)
     return INT64_MAX;
 
   // If we used an estimate for the uv intra rd in the loop above...
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
-    if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
-      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+    if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
+      TX_SIZE uv_tx_size;
+      *mbmi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mbmi);
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
                               &skip_uv[uv_tx_size],
-                              bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
-    }
-  }
-
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
-
-  // Flag all modes that have a distortion thats > 2x the best we found at
-  // this level.
-  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
-      continue;
-
-    if (mode_distortions[mode_index] > 2 * *returndistortion) {
-      ctx->modes_with_high_error |= (1 << mode_index);
+                              bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
+                              uv_tx_size);
     }
   }
 
-  // Flag all ref frames that have a distortion thats > 2x the best we found at
-  // this level.
-  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
-      ctx->frames_with_high_error |= (1 << ref_frame);
-    }
-  }
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter) ||
+         !is_inter_block(&best_mbmode));
 
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.ref_frame[0] == INTRA_FRAME));
-
-  // Updating rd_thresh_freq_fact[] here means that the different
-  // partition/block sizes are handled independently based on the best
-  // choice for the current partition. It may well be better to keep a scaled
-  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
-  // combination that wins out.
-  if (cpi->sf.adaptive_rd_thresh) {
-    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-      if (mode_index == best_mode_index) {
-        cpi->rd_thresh_freq_fact[bsize][mode_index] -=
-          (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
-      } else {
-        cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
-        if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
-            (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
-          cpi->rd_thresh_freq_fact[bsize][mode_index] =
-            cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
-        }
-      }
-    }
-  }
+  update_rd_thresh_fact(cpi, bsize, best_mode_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
   x->skip |= best_skip2;
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+  for (i = 0; i < REFERENCE_MODES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
     else
@@ -3766,13 +3617,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       else
         best_filter_diff[i] = best_rd - best_filter_rd[i];
     }
-    if (cm->mcomp_filter_type == SWITCHABLE)
+    if (cm->interp_filter == SWITCHABLE)
       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp9_zero(best_filter_diff);
-  }
-
-  if (!x->skip) {
     for (i = 0; i < TX_MODES; i++) {
       if (best_tx_rd[i] == INT64_MAX)
         best_tx_diff[i] = 0;
@@ -3780,11 +3626,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         best_tx_diff[i] = best_rd - best_tx_rd[i];
     }
   } else {
+    vp9_zero(best_filter_diff);
     vp9_zero(best_tx_diff);
   }
 
-  set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
-                    scale_factor);
+  if (!x->in_active_map) {
+    assert(mbmi->ref_frame[0] == LAST_FRAME);
+    assert(mbmi->ref_frame[1] == NONE);
+    assert(mbmi->mode == NEARESTMV ||
+           mbmi->mode == NEARMV ||
+           mbmi->mode == ZEROMV);
+    assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0);
+    assert(mbmi->mode == mbmi->uv_mode);
+  }
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
@@ -3803,11 +3659,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                       BLOCK_SIZE bsize,
                                       PICK_MODE_CONTEXT *ctx,
                                       int64_t best_rd_so_far) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const struct segmentation *seg = &cm->seg;
-  const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct segmentation *const seg = &cm->seg;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i;
@@ -3815,40 +3671,34 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
-  int idx_list[4] = {0,
-                     cpi->lst_fb_idx,
-                     cpi->gld_fb_idx,
-                     cpi->alt_fb_idx};
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
-  int64_t best_tx_rd[TX_MODES];
-  int64_t best_tx_diff[TX_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  static const int64_t best_tx_diff[TX_MODES] = { 0 };
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  MB_MODE_INFO best_mbmode = { 0 };
-  int mode_index, best_mode_index = 0;
+  MB_MODE_INFO best_mbmode;
+  int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vp9_prob comp_mode_p;
   int64_t best_inter_rd = INT64_MAX;
   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
-  INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
-  int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
-  int64_t dist_uv[TX_SIZES];
-  int skip_uv[TX_SIZES];
-  MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
-  struct scale_factors scale_factor[4];
-  unsigned int ref_frame_mask = 0;
-  unsigned int mode_mask = 0;
-  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
-                                             cpi->common.y_dc_delta_q);
+  INTERP_FILTER tmp_best_filter = SWITCHABLE;
+  int rate_uv_intra, rate_uv_tokenonly;
+  int64_t dist_uv;
+  int skip_uv;
+  PREDICTION_MODE mode_uv = DC_PRED;
+  int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
+  int ref_frame_mask = 0;
+  int mode_skip_mask = 0;
 
-  x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+  vp9_zero(best_mbmode);
 
   for (i = 0; i < 4; i++) {
     int j;
@@ -3856,41 +3706,40 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       seg_mvs[i][j].as_int = INVALID_MV;
   }
 
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+  for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < TX_MODES; i++)
-    best_tx_rd[i] = INT64_MAX;
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
     best_filter_rd[i] = INT64_MAX;
-  for (i = 0; i < TX_SIZES; i++)
-    rate_uv_intra[i] = INT_MAX;
+  rate_uv_intra = INT_MAX;
 
   *returnrate = INT_MAX;
 
-  // Create a mask set to 1 for each reference frame used by a smaller
-  // resolution.
-  if (cpi->sf.use_avoid_tested_higherror) {
-    ref_frame_mask = 0;
-    mode_mask = 0;
-    ref_frame_mask = ~ref_frame_mask;
-    mode_mask = ~mode_mask;
-  }
-
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
-                         block_size, mi_row, mi_col,
-                         frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         yv12_mb, scale_factor);
+      vp9_setup_buffer_inter(cpi, x, tile,
+                             ref_frame, bsize, mi_row, mi_col,
+                             frame_mv[NEARESTMV], frame_mv[NEARMV],
+                             yv12_mb);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
+  for (ref_frame = LAST_FRAME;
+       ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+    int i;
+    for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+      if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
+        ref_frame_mask |= (1 << ref_frame);
+        break;
+      }
+    }
+  }
+
+  for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
@@ -3898,125 +3747,84 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable = 0;
-    int64_t tx_cache[TX_MODES];
     int i;
     int this_skip2 = 0;
     int64_t total_sse = INT_MAX;
     int early_term = 0;
 
-    for (i = 0; i < TX_MODES; ++i)
-      tx_cache[i] = INT64_MAX;
-
-    x->skip = 0;
-    ref_frame = vp9_ref_order[mode_index].ref_frame;
-    second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
+    ref_frame = vp9_ref_order[ref_index].ref_frame[0];
+    second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
 
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
-    if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
-      if (mode_index == 3) {
-        switch (vp9_ref_order[best_mode_index].ref_frame) {
+    if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+      if (ref_index == 3) {
+        switch (vp9_ref_order[best_ref_index].ref_frame[0]) {
           case INTRA_FRAME:
-            cpi->mode_skip_mask = 0;
+            mode_skip_mask = 0;
             break;
           case LAST_FRAME:
-            cpi->mode_skip_mask = 0x0010;
+            mode_skip_mask = 0x0010;
             break;
           case GOLDEN_FRAME:
-            cpi->mode_skip_mask = 0x0008;
+            mode_skip_mask = 0x0008;
             break;
           case ALTREF_FRAME:
-            cpi->mode_skip_mask = 0x0000;
+            mode_skip_mask = 0x0000;
             break;
           case NONE:
           case MAX_REF_FRAMES:
-            assert(!"Invalid Reference frame");
+            assert(0 && "Invalid Reference frame");
         }
       }
-      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+      if (mode_skip_mask & (1 << ref_index))
         continue;
     }
 
-    // Skip if the current reference frame has been masked off
-    if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
-        (cpi->ref_frame_mask & (1 << ref_frame)))
-      continue;
-
     // Test best rd so far against threshold for trying this mode.
-    if ((best_rd <
-         ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
-          cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
-        cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
+    if (rd_less_than_thresh(best_rd,
+                            rd_opt->threshes[segment_id][bsize][ref_index],
+                            rd_opt->thresh_freq_fact[bsize][ref_index]))
       continue;
 
-    // Do not allow compound prediction if the segment level reference
-    // frame feature is in use as in this case there can only be one reference.
-    if ((second_ref_frame > INTRA_FRAME) &&
-         vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-      continue;
-
-    mbmi->ref_frame[0] = ref_frame;
-    mbmi->ref_frame[1] = second_ref_frame;
-
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-    if (!(second_ref_frame == NONE
-        || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
+    if (ref_frame > INTRA_FRAME &&
+        !(cpi->ref_frame_flags & flag_list[ref_frame])) {
       continue;
     }
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
-        if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME)
-          continue;
-      if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
-        if (ref_frame != best_inter_ref_frame &&
-            second_ref_frame != best_inter_ref_frame)
-          continue;
+      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
+        continue;
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+        continue;
+      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+          vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME)
+        continue;
+      if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
+          ref_frame != best_inter_ref_frame &&
+          second_ref_frame != best_inter_ref_frame)
+        continue;
     }
 
     // TODO(jingning, jkoleszar): scaling reference frame not supported for
     // sub8x8 blocks.
-    if (ref_frame > 0 &&
-        vp9_is_scaled(scale_factor[ref_frame].sfc))
+    if (ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
       continue;
 
-    if (second_ref_frame > 0 &&
-        vp9_is_scaled(scale_factor[second_ref_frame].sfc))
+    if (second_ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
       continue;
 
-    set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
-    mbmi->uv_mode = DC_PRED;
-
-    // Evaluate all sub-pel filters irrespective of whether we can use
-    // them for this frame.
-    mbmi->interp_filter = cm->mcomp_filter_type;
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
     if (comp_pred) {
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
-        continue;
-      set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
-
-      mode_excluded = mode_excluded
-                         ? mode_excluded
-                         : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
-    } else {
-      if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
-        mode_excluded =
-            mode_excluded ?
-                mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-    }
-
-    // Select prediction reference frames.
-    for (i = 0; i < MAX_MB_PLANE; i++) {
-      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
-      if (comp_pred)
-        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+      mode_excluded = mode_excluded ? mode_excluded
+                                    : cm->reference_mode == SINGLE_REFERENCE;
+    } else if (ref_frame != INTRA_FRAME) {
+      mode_excluded = mode_excluded ? mode_excluded
+                                    : cm->reference_mode == COMPOUND_REFERENCE;
     }
 
     // If the segment reference frame feature is enabled....
@@ -4039,19 +3847,30 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative. We allow near/nearest as well
       // because they may result in zero-zero MVs but be cheaper.
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
         continue;
     }
 
-#ifdef MODE_TEST_HIT_STATS
-    // TEST/DEBUG CODE
-    // Keep a rcord of the number of test hits at each size
-    cpi->mode_test_hits[bsize]++;
-#endif
+    mbmi->tx_size = TX_4X4;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = second_ref_frame;
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                          : cm->interp_filter;
+    x->skip = 0;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
 
     if (ref_frame == INTRA_FRAME) {
       int rate;
-      mbmi->tx_size = TX_4X4;
       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
                                        &distortion_y, best_rd) >= best_rd)
         continue;
@@ -4059,20 +3878,18 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       rate2 += intra_cost_penalty;
       distortion2 += distortion_y;
 
-      if (rate_uv_intra[TX_4X4] == INT_MAX) {
-        choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
-                             &rate_uv_tokenonly[TX_4X4],
-                             &dist_uv[TX_4X4], &skip_uv[TX_4X4],
-                             &mode_uv[TX_4X4]);
+      if (rate_uv_intra == INT_MAX) {
+        choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
+                             &rate_uv_intra,
+                             &rate_uv_tokenonly,
+                             &dist_uv, &skip_uv,
+                             &mode_uv);
       }
-      rate2 += rate_uv_intra[TX_4X4];
-      rate_uv = rate_uv_tokenonly[TX_4X4];
-      distortion2 += dist_uv[TX_4X4];
-      distortion_uv = dist_uv[TX_4X4];
-      mbmi->uv_mode = mode_uv[TX_4X4];
-      tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-      for (i = 0; i < TX_MODES; ++i)
-        tx_cache[i] = tx_cache[ONLY_4X4];
+      rate2 += rate_uv_intra;
+      rate_uv = rate_uv_tokenonly;
+      distortion2 += dist_uv;
+      distortion_uv = dist_uv;
+      mbmi->uv_mode = mode_uv;
     } else {
       int rate;
       int64_t distortion;
@@ -4091,19 +3908,24 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       int uv_skippable;
 
       this_rd_thresh = (ref_frame == LAST_FRAME) ?
-          cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
-          cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
+          rd_opt->threshes[segment_id][bsize][THR_LAST] :
+          rd_opt->threshes[segment_id][bsize][THR_ALTR];
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
-      xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
+      rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+      rd_opt->mask_filter = 0;
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+        rd_opt->filter_cache[i] = INT64_MAX;
 
-      cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
-      if (cm->mcomp_filter_type != BILINEAR) {
+      if (cm->interp_filter != BILINEAR) {
         tmp_best_filter = EIGHTTAP;
-        if (x->source_variance <
-            cpi->sf.disable_filter_search_var_thresh) {
+        if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
           tmp_best_filter = EIGHTTAP;
-          vp9_zero(cpi->rd_filter_cache);
+        } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
+                   ctx->pred_interp_filter < SWITCHABLE) {
+          tmp_best_filter = ctx->pred_interp_filter;
+        } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
+          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+                              ctx->pred_interp_filter : 0;
         } else {
           for (switchable_filter_index = 0;
                switchable_filter_index < SWITCHABLE_FILTERS;
@@ -4111,37 +3933,36 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
             int newbest, rs;
             int64_t rs_rd;
             mbmi->interp_filter = switchable_filter_index;
-            vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-            tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
-                                                 &mbmi->ref_mvs[ref_frame][0],
-                                                 second_ref,
-                                                 best_yrd,
-                                                 &rate, &rate_y, &distortion,
-                                                 &skippable, &total_sse,
-                                                 (int)this_rd_thresh, seg_mvs,
-                                                 bsi, switchable_filter_index,
-                                                 mi_row, mi_col);
+            tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+                                              &mbmi->ref_mvs[ref_frame][0],
+                                              second_ref, best_yrd, &rate,
+                                              &rate_y, &distortion,
+                                              &skippable, &total_sse,
+                                              (int) this_rd_thresh, seg_mvs,
+                                              bsi, switchable_filter_index,
+                                              mi_row, mi_col);
 
             if (tmp_rd == INT64_MAX)
               continue;
-            cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
-            rs = get_switchable_rate(x);
+            rs = vp9_get_switchable_rate(cpi);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-            cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
-                MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
+            rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
+            rd_opt->filter_cache[SWITCHABLE_FILTERS] =
+                MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS],
                     tmp_rd + rs_rd);
-            if (cm->mcomp_filter_type == SWITCHABLE)
+            if (cm->interp_filter == SWITCHABLE)
               tmp_rd += rs_rd;
 
+            rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd);
+
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
               tmp_best_filter = mbmi->interp_filter;
               tmp_best_rd = tmp_rd;
             }
-            if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-                (mbmi->interp_filter == cm->mcomp_filter_type &&
-                 cm->mcomp_filter_type != SWITCHABLE)) {
+            if ((newbest && cm->interp_filter == SWITCHABLE) ||
+                (mbmi->interp_filter == cm->interp_filter &&
+                 cm->interp_filter != SWITCHABLE)) {
               tmp_best_rdu = tmp_rd;
               tmp_best_rate = rate;
               tmp_best_ratey = rate_y;
@@ -4150,8 +3971,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mbmi;
               for (i = 0; i < 4; i++) {
-                tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
-                x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+                tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
+                x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
               }
               pred_exists = 1;
               if (switchable_filter_index == 0 &&
@@ -4170,32 +3991,23 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         }
       }
 
-      if (tmp_best_rdu == INT64_MAX)
+      if (tmp_best_rdu == INT64_MAX && pred_exists)
         continue;
 
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter : cm->interp_filter);
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
-                     &mbmi->ref_mvs[ref_frame][0],
-                     second_ref,
-                     best_yrd,
-                     &rate, &rate_y, &distortion,
-                     &skippable, &total_sse,
-                     (int)this_rd_thresh, seg_mvs,
-                     bsi, 0,
-                     mi_row, mi_col);
+        tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+                                          &mbmi->ref_mvs[ref_frame][0],
+                                          second_ref, best_yrd, &rate, &rate_y,
+                                          &distortion, &skippable, &total_sse,
+                                          (int) this_rd_thresh, seg_mvs, bsi, 0,
+                                          mi_row, mi_col);
         if (tmp_rd == INT64_MAX)
           continue;
       } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = get_switchable_rate(x);
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
         total_sse = tmp_best_sse;
         rate = tmp_best_rate;
         rate_y = tmp_best_ratey;
@@ -4203,21 +4015,19 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         skippable = tmp_best_skippable;
         *mbmi = tmp_best_mbmode;
         for (i = 0; i < 4; i++)
-          xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i];
+          xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
       }
 
       rate2 += rate;
       distortion2 += distortion;
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += get_switchable_rate(x);
+      if (cm->interp_filter == SWITCHABLE)
+        rate2 += vp9_get_switchable_rate(cpi);
+
+      if (!mode_excluded)
+        mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+                                  : cm->reference_mode == COMPOUND_REFERENCE;
 
-      if (!mode_excluded) {
-        if (comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
 
       tmp_best_rdu = best_rd -
@@ -4237,16 +4047,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
         total_sse += uv_sse;
-
-        tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-        for (i = 0; i < TX_MODES; ++i)
-          tx_cache[i] = tx_cache[ONLY_4X4];
       }
     }
 
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+    if (cm->reference_mode == REFERENCE_MODE_SELECT)
       rate2 += compmode_cost;
-    }
 
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
@@ -4269,14 +4074,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
-                                            0);
-          rate2 += prob_skip_cost;
+          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
         } else {
           // FIXME(rbultje) make this work for splitmv also
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
-                                            1);
-          rate2 += prob_skip_cost;
+          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
           distortion2 = total_sse;
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
@@ -4286,9 +4087,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         }
       } else if (mb_skip_allowed) {
         // Add in the cost of the no skip flag.
-        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
-                                          0);
-        rate2 += prob_skip_cost;
+        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
       }
 
       // Calculate the final RD estimate for this mode.
@@ -4296,8 +4095,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     // Keep record of best inter rd with single reference
-    if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
-        xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
+    if (is_inter_block(mbmi) &&
+        !has_second_ref(mbmi) &&
         !mode_excluded &&
         this_rd < best_inter_rd) {
       best_inter_rd = this_rd;
@@ -4305,7 +4104,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+      for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -4314,12 +4113,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
+        int max_plane = MAX_MB_PLANE;
         // Note index of best mode so far
-        best_mode_index = mode_index;
+        best_ref_index = ref_index;
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
+          max_plane = 1;
         }
 
         *returnrate = rate2;
@@ -4329,16 +4130,18 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
-        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+        if (!x->select_txfm_size)
+          swap_block_ptr(x, ctx, max_plane);
+        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
                    sizeof(uint8_t) * ctx->num_4x4_blk);
 
         for (i = 0; i < 4; i++)
-          best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+          best_bmodes[i] = xd->mi[0]->bmi[i];
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
-            (mode_index > MIN_EARLY_TERM_INDEX)) {
+            (ref_index > MIN_EARLY_TERM_INDEX)) {
           const int qstep = xd->plane[0].dequant[1];
           // TODO(debargha): Enhance this by specializing for each mode_index
           int scale = 4;
@@ -4356,9 +4159,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
 
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      if (cm->reference_mode == REFERENCE_MODE_SELECT) {
         single_rate = rate2 - compmode_cost;
         hybrid_rate = rate2;
       } else {
@@ -4369,54 +4172,35 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+      if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+        best_pred_rd[SINGLE_REFERENCE] = single_rd;
+      } else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
       }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+      if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+        best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
     /* keep record of best filter type */
     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
-        cm->mcomp_filter_type != BILINEAR) {
-      int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->mcomp_filter_type];
+        cm->interp_filter != BILINEAR) {
+      int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+                              SWITCHABLE_FILTERS : cm->interp_filter];
+      int64_t adj_rd;
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-        int64_t adj_rd;
-        // In cases of poor prediction, filter_cache[] can contain really big
-        // values, which actually are bigger than this_rd itself. This can
-        // cause negative best_filter_rd[] values, which is obviously silly.
-        // Therefore, if filter_cache < ref, we do an adjusted calculation.
-        if (cpi->rd_filter_cache[i] >= ref)
-          adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
-        else  // FIXME(rbultje) do this for comppred also
-          adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
-        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
-      }
-    }
-
-    /* keep record of best txfm size */
-    if (bsize < BLOCK_32X32) {
-      if (bsize < BLOCK_16X16) {
-        tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
-        tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
-      }
-      tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
-    }
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
-        int64_t adj_rd = INT64_MAX;
-        if (ref_frame > INTRA_FRAME)
-          adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
+        if (ref == INT64_MAX)
+          adj_rd = 0;
+        else if (rd_opt->filter_cache[i] == INT64_MAX)
+          // when early termination is triggered, the encoder does not have
+          // access to the rate-distortion cost. it only knows that the cost
+          // should be above the maximum valid value. hence it takes the known
+          // maximum plus an arbitrary constant as the rate-distortion cost.
+          adj_rd = rd_opt->mask_filter - ref + 10;
         else
-          adj_rd = this_rd;
+          adj_rd = rd_opt->filter_cache[i] - ref;
 
-        if (adj_rd < best_tx_rd[i])
-          best_tx_rd[i] = adj_rd;
+        adj_rd += this_rd;
+        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
       }
     }
 
@@ -4433,67 +4217,43 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   // If we used an estimate for the uv intra rd in the loop above...
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
-    if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
-      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
-                              &rate_uv_tokenonly[uv_tx_size],
-                              &dist_uv[uv_tx_size],
-                              &skip_uv[uv_tx_size],
-                              BLOCK_8X8);
+    if (vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) {
+      *mbmi = best_mbmode;
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
+                              &rate_uv_tokenonly,
+                              &dist_uv,
+                              &skip_uv,
+                              BLOCK_8X8, TX_4X4);
     }
   }
 
-  // If we are using reference masking and the set mask flag is set then
-  // create the reference frame mask.
-  if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
-    cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame);
-
-  if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
+  if (best_rd == INT64_MAX) {
     *returnrate = INT_MAX;
-    *returndistortion = INT_MAX;
+    *returndistortion = INT64_MAX;
     return best_rd;
   }
 
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.ref_frame[0] == INTRA_FRAME));
-
-  // Updating rd_thresh_freq_fact[] here means that the different
-  // partition/block sizes are handled independently based on the best
-  // choice for the current partition. It may well be better to keep a scaled
-  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
-  // combination that wins out.
-  if (cpi->sf.adaptive_rd_thresh) {
-    for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
-      if (mode_index == best_mode_index) {
-        cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -=
-          (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3);
-      } else {
-        cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
-        if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
-            (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
-          cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
-            cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
-        }
-      }
-    }
-  }
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter) ||
+         !is_inter_block(&best_mbmode));
+
+  update_rd_thresh_fact(cpi, bsize, best_ref_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
   x->skip |= best_skip2;
-  if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+  if (!is_inter_block(&best_mbmode)) {
     for (i = 0; i < 4; i++)
-      xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+      xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
   } else {
     for (i = 0; i < 4; ++i)
-      vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+      vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
 
-    mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
+    mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
   }
 
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+  for (i = 0; i < REFERENCE_MODES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
     else
@@ -4507,26 +4267,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       else
         best_filter_diff[i] = best_rd - best_filter_rd[i];
     }
-    if (cm->mcomp_filter_type == SWITCHABLE)
+    if (cm->interp_filter == SWITCHABLE)
       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   } else {
     vp9_zero(best_filter_diff);
   }
 
-  if (!x->skip) {
-    for (i = 0; i < TX_MODES; i++) {
-      if (best_tx_rd[i] == INT64_MAX)
-        best_tx_diff[i] = 0;
-      else
-        best_tx_diff[i] = best_rd - best_tx_rd[i];
-    }
-  } else {
-    vp9_zero(best_tx_diff);
-  }
-
-  set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
-                    scale_factor);
-  store_coding_context(x, ctx, best_mode_index,
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  store_coding_context(x, ctx, best_ref_index,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
                                       mbmi->ref_frame[1]][0],
@@ -4534,3 +4282,120 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   return best_rd;
 }
+
+void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
+  // Fills rd->thresh_mult[]: a baseline for every mode, a fixed increment per
+  // mode, then INT_MAX for modes whose reference frame flag(s) are not set.
+  RD_OPT *const rd = &cpi->rd;
+  const int last_alt = VP9_LAST_FLAG | VP9_ALT_FLAG;
+  const int gold_alt = VP9_GOLD_FLAG | VP9_ALT_FLAG;
+  int mode;
+
+  // Baseline: -500 when is_best_mode() accepts the configured mode, else 0.
+  for (mode = 0; mode < MAX_MODES; ++mode) {
+    rd->thresh_mult[mode] = 0;
+    if (is_best_mode(cpi->oxcf.mode)) rd->thresh_mult[mode] = -500;
+  }
+
+  rd->thresh_mult[THR_NEARESTA] = 0;
+  rd->thresh_mult[THR_NEARESTG] = 0;
+  rd->thresh_mult[THR_NEARESTMV] = 0;
+
+  rd->thresh_mult[THR_DC] += 1000;
+  rd->thresh_mult[THR_TM] += 1000;
+  rd->thresh_mult[THR_NEWMV] += 1000;
+  rd->thresh_mult[THR_NEWG] += 1000;
+  rd->thresh_mult[THR_NEWA] += 1000;
+  rd->thresh_mult[THR_NEARMV] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+  rd->thresh_mult[THR_NEARA] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+
+  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
+
+  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
+  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
+  rd->thresh_mult[THR_H_PRED] += 2000;
+  rd->thresh_mult[THR_V_PRED] += 2000;
+
+  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
+  rd->thresh_mult[THR_D45_PRED] += 2500;
+  rd->thresh_mult[THR_D63_PRED] += 2500;
+  rd->thresh_mult[THR_D117_PRED] += 2500;
+  rd->thresh_mult[THR_D135_PRED] += 2500;
+  rd->thresh_mult[THR_D153_PRED] += 2500;
+  rd->thresh_mult[THR_D207_PRED] += 2500;
+
+  // Modes that use a reference frame whose flag is not set are disabled.
+  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
+    rd->thresh_mult[THR_NEARESTMV] = INT_MAX;
+    rd->thresh_mult[THR_NEARMV] = INT_MAX;
+    rd->thresh_mult[THR_ZEROMV] = INT_MAX;
+    rd->thresh_mult[THR_NEWMV] = INT_MAX;
+  }
+  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+    rd->thresh_mult[THR_NEARESTG] = INT_MAX;
+    rd->thresh_mult[THR_NEARG] = INT_MAX;
+    rd->thresh_mult[THR_ZEROG] = INT_MAX;
+    rd->thresh_mult[THR_NEWG] = INT_MAX;
+  }
+  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
+    rd->thresh_mult[THR_NEARESTA] = INT_MAX;
+    rd->thresh_mult[THR_NEARA] = INT_MAX;
+    rd->thresh_mult[THR_ZEROA] = INT_MAX;
+    rd->thresh_mult[THR_NEWA] = INT_MAX;
+  }
+  if ((cpi->ref_frame_flags & last_alt) != last_alt) {
+    rd->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
+    rd->thresh_mult[THR_COMP_NEARLA] = INT_MAX;
+    rd->thresh_mult[THR_COMP_ZEROLA] = INT_MAX;
+    rd->thresh_mult[THR_COMP_NEWLA] = INT_MAX;
+  }
+  if ((cpi->ref_frame_flags & gold_alt) != gold_alt) {
+    rd->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+    rd->thresh_mult[THR_COMP_NEARGA] = INT_MAX;
+    rd->thresh_mult[THR_COMP_ZEROGA] = INT_MAX;
+    rd->thresh_mult[THR_COMP_NEWGA] = INT_MAX;
+  }
+}
+
+void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
+  // Fills rd->thresh_mult_sub8x8[]: baseline, increments, then INT_MAX disables.
+  RD_OPT *const rd = &cpi->rd;
+  const int last_alt = VP9_LAST_FLAG | VP9_ALT_FLAG;
+  const int gold_alt = VP9_GOLD_FLAG | VP9_ALT_FLAG;
+  int ref;
+
+  for (ref = 0; ref < MAX_REFS; ++ref)
+    rd->thresh_mult_sub8x8[ref] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+  rd->thresh_mult_sub8x8[THR_INTRA] += 2500;
+  rd->thresh_mult_sub8x8[THR_LAST] += 2500;
+  rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
+  rd->thresh_mult_sub8x8[THR_ALTR] += 2500;
+  rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
+  rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
+
+  // Entries whose bit is set in sf.disable_split_mask are switched off.
+  for (ref = 0; ref < MAX_REFS; ++ref) {
+    if (cpi->sf.disable_split_mask & (1 << ref))
+      rd->thresh_mult_sub8x8[ref] = INT_MAX;
+  }
+
+  // So are entries that use a reference frame whose flag is not set.
+  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
+    rd->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
+  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
+    rd->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
+  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
+    rd->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
+  if ((cpi->ref_frame_flags & last_alt) != last_alt)
+    rd->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
+  if ((cpi->ref_frame_flags & gold_alt) != gold_alt)
+    rd->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
index 92fb23548e0..b6b51e55382 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
@@ -8,24 +8,52 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_RDOPT_H_
 #define VP9_ENCODER_VP9_RDOPT_H_
 
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define RDDIV_BITS          7
 
 #define RDCOST(RM, DM, R, D) \
   (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
 #define QIDX_SKIP_THRESH     115
 
+#define MV_COST_WEIGHT      108
+#define MV_COST_WEIGHT_SUB  120
+
+#define INVALID_MV 0x80008000
+
 struct TileInfo;
 
-int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex);
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi);
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist);
+
+int vp9_get_switchable_rate(const VP9_COMP *cpi);
+
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                            const TileInfo *const tile,
+                            MV_REFERENCE_FRAME ref_frame,
+                            BLOCK_SIZE block_size,
+                            int mi_row, int mi_col,
+                            int_mv frame_nearest_mv[MAX_REF_FRAMES],
+                            int_mv frame_near_mv[MAX_REF_FRAMES],
+                            struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
+
+const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+                                                   int ref_frame);
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
@@ -50,12 +78,60 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
 void vp9_init_me_luts();
 
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
-                            MB_PREDICTION_MODE mb, int_mv *mv);
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]);
+
+void vp9_set_rd_speed_thresholds(VP9_COMP *cpi);
+
+void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi);
+
+static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, MV *mvp_full,
+                                    int step_param, int error_per_bit,
+                                    const MV *ref_mv, MV *tmp_mv,
+                                    int var_max, int rd) {
+  // Runs the full-pixel search selected by cpi->sf.search_method. For the five
+  // named methods, a result below var_max is replaced by vp9_get_mvpred_var()
+  int var;  // when rd is non-zero.
+
+  switch (cpi->sf.search_method) {
+    case FAST_DIAMOND:
+      var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                                &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+      break;
+    case FAST_HEX:
+      var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                                &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+      break;
+    case HEX:
+      var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                           &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+      break;
+    case SQUARE:
+      var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
+                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+      break;
+    case BIGDIA:
+      var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                              &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
+      break;
+    default: {
+      // Any other method: diamond search, returned without the re-check.
+      const int steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+      return vp9_full_pixel_diamond(cpi, x, mvp_full, step_param,
+                                    error_per_bit, steps, 1,
+                                    &cpi->fn_ptr[bsize], ref_mv, tmp_mv);
+    }
+  }
+  if (rd && var < var_max)
+    var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
+  return var;
+}
 
-void vp9_get_entropy_contexts(TX_SIZE tx_size,
-    ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
-    const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
-    int num_4x4_w, int num_4x4_h);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_RDOPT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
new file mode 100644
index 00000000000..4e6efaeb969
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
@@ -0,0 +1,576 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_resize.h"
+
+#define FILTER_BITS               7
+
+#define INTERP_TAPS               8
+#define SUBPEL_BITS               5
+#define SUBPEL_MASK               ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS     32
+
+typedef int16_t interp_kernel[INTERP_TAPS];
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+  {-3,  0, 35, 64, 35,  0, -3, 0},
+  {-3, -1, 34, 64, 36,  1, -3, 0},
+  {-3, -1, 32, 64, 38,  1, -3, 0},
+  {-2, -2, 31, 63, 39,  2, -3, 0},
+  {-2, -2, 29, 63, 41,  2, -3, 0},
+  {-2, -2, 28, 63, 42,  3, -4, 0},
+  {-2, -3, 27, 63, 43,  4, -4, 0},
+  {-2, -3, 25, 62, 45,  5, -4, 0},
+  {-2, -3, 24, 62, 46,  5, -4, 0},
+  {-2, -3, 23, 61, 47,  6, -4, 0},
+  {-2, -3, 21, 60, 49,  7, -4, 0},
+  {-1, -4, 20, 60, 50,  8, -4, -1},
+  {-1, -4, 19, 59, 51,  9, -4, -1},
+  {-1, -4, 17, 58, 52, 10, -4, 0},
+  {-1, -4, 16, 57, 53, 12, -4, -1},
+  {-1, -4, 15, 56, 54, 13, -4, -1},
+  {-1, -4, 14, 55, 55, 14, -4, -1},
+  {-1, -4, 13, 54, 56, 15, -4, -1},
+  {-1, -4, 12, 53, 57, 16, -4, -1},
+  {0, -4, 10, 52, 58, 17, -4, -1},
+  {-1, -4,  9, 51, 59, 19, -4, -1},
+  {-1, -4,  8, 50, 60, 20, -4, -1},
+  {0, -4,  7, 49, 60, 21, -3, -2},
+  {0, -4,  6, 47, 61, 23, -3, -2},
+  {0, -4,  5, 46, 62, 24, -3, -2},
+  {0, -4,  5, 45, 62, 25, -3, -2},
+  {0, -4,  4, 43, 63, 27, -3, -2},
+  {0, -4,  3, 42, 63, 28, -2, -2},
+  {0, -3,  2, 41, 63, 29, -2, -2},
+  {0, -3,  2, 39, 63, 31, -2, -2},
+  {0, -3,  1, 38, 64, 32, -1, -3},
+  {0, -3,  1, 36, 64, 34, -1, -3}
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+  {-1, -8, 33, 80, 33, -8, -1, 0},
+  {-1, -8, 30, 80, 35, -8, -1, 1},
+  {-1, -8, 28, 80, 37, -7, -2, 1},
+  {0, -8, 26, 79, 39, -7, -2, 1},
+  {0, -8, 24, 79, 41, -7, -2, 1},
+  {0, -8, 22, 78, 43, -6, -2, 1},
+  {0, -8, 20, 78, 45, -5, -3, 1},
+  {0, -8, 18, 77, 48, -5, -3, 1},
+  {0, -8, 16, 76, 50, -4, -3, 1},
+  {0, -8, 15, 75, 52, -3, -4, 1},
+  {0, -7, 13, 74, 54, -3, -4, 1},
+  {0, -7, 11, 73, 56, -2, -4, 1},
+  {0, -7, 10, 71, 58, -1, -4, 1},
+  {1, -7,  8, 70, 60,  0, -5, 1},
+  {1, -6,  6, 68, 62,  1, -5, 1},
+  {1, -6,  5, 67, 63,  2, -5, 1},
+  {1, -6,  4, 65, 65,  4, -6, 1},
+  {1, -5,  2, 63, 67,  5, -6, 1},
+  {1, -5,  1, 62, 68,  6, -6, 1},
+  {1, -5,  0, 60, 70,  8, -7, 1},
+  {1, -4, -1, 58, 71, 10, -7, 0},
+  {1, -4, -2, 56, 73, 11, -7, 0},
+  {1, -4, -3, 54, 74, 13, -7, 0},
+  {1, -4, -3, 52, 75, 15, -8, 0},
+  {1, -3, -4, 50, 76, 16, -8, 0},
+  {1, -3, -5, 48, 77, 18, -8, 0},
+  {1, -3, -5, 45, 78, 20, -8, 0},
+  {1, -2, -6, 43, 78, 22, -8, 0},
+  {1, -2, -7, 41, 79, 24, -8, 0},
+  {1, -2, -7, 39, 79, 26, -8, 0},
+  {1, -2, -7, 37, 80, 28, -8, -1},
+  {1, -1, -8, 35, 80, 30, -8, -1},
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+  {2, -11,  25,  96,  25, -11,   2, 0},
+  {2, -11,  22,  96,  28, -11,   2, 0},
+  {2, -10,  19,  95,  31, -11,   2, 0},
+  {2, -10,  17,  95,  34, -12,   2, 0},
+  {2,  -9,  14,  94,  37, -12,   2, 0},
+  {2,  -8,  12,  93,  40, -12,   1, 0},
+  {2,  -8,   9,  92,  43, -12,   1, 1},
+  {2,  -7,   7,  91,  46, -12,   1, 0},
+  {2,  -7,   5,  90,  49, -12,   1, 0},
+  {2,  -6,   3,  88,  52, -12,   0, 1},
+  {2,  -5,   1,  86,  55, -12,   0, 1},
+  {2,  -5,  -1,  84,  58, -11,   0, 1},
+  {2,  -4,  -2,  82,  61, -11,  -1, 1},
+  {2,  -4,  -4,  80,  64, -10,  -1, 1},
+  {1, -3, -5, 77, 67, -9, -1, 1},
+  {1, -3, -6, 75, 70, -8, -2, 1},
+  {1, -2, -7, 72, 72, -7, -2, 1},
+  {1, -2, -8, 70, 75, -6, -3, 1},
+  {1, -1, -9, 67, 77, -5, -3, 1},
+  {1,  -1, -10,  64,  80,  -4,  -4, 2},
+  {1,  -1, -11,  61,  82,  -2,  -4, 2},
+  {1,   0, -11,  58,  84,  -1,  -5, 2},
+  {1,   0, -12,  55,  86,   1,  -5, 2},
+  {1,   0, -12,  52,  88,   3,  -6, 2},
+  {0,   1, -12,  49,  90,   5,  -7, 2},
+  {0,   1, -12,  46,  91,   7,  -7, 2},
+  {1,   1, -12,  43,  92,   9,  -8, 2},
+  {0,   1, -12,  40,  93,  12,  -8, 2},
+  {0,   2, -12,  37,  94,  14,  -9, 2},
+  {0,   2, -12,  34,  95,  17, -10, 2},
+  {0,   2, -11,  31,  95,  19, -10, 2},
+  {0,   2, -11,  28,  96,  22, -11, 2}
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+  {3,  -8,  13, 112,  13,  -8,   3, 0},
+  {3,  -7,  10, 112,  17,  -9,   3, -1},
+  {2,  -6,   7, 111,  21,  -9,   3, -1},
+  {2,  -5,   4, 111,  24, -10,   3, -1},
+  {2,  -4,   1, 110,  28, -11,   3, -1},
+  {1,  -3,  -1, 108,  32, -12,   4, -1},
+  {1,  -2,  -3, 106,  36, -13,   4, -1},
+  {1,  -1,  -6, 105,  40, -14,   4, -1},
+  {1,  -1,  -7, 102,  44, -14,   4, -1},
+  {1,   0,  -9, 100,  48, -15,   4, -1},
+  {1,   1, -11,  97,  53, -16,   4, -1},
+  {0,   1, -12,  95,  57, -16,   4, -1},
+  {0,   2, -13,  91,  61, -16,   4, -1},
+  {0,   2, -14,  88,  65, -16,   4, -1},
+  {0,   3, -15,  84,  69, -17,   4, 0},
+  {0,   3, -16,  81,  73, -16,   3, 0},
+  {0,   3, -16,  77,  77, -16,   3, 0},
+  {0,   3, -16,  73,  81, -16,   3, 0},
+  {0,   4, -17,  69,  84, -15,   3, 0},
+  {-1,   4, -16,  65,  88, -14,   2, 0},
+  {-1,   4, -16,  61,  91, -13,   2, 0},
+  {-1,   4, -16,  57,  95, -12,   1, 0},
+  {-1,   4, -16,  53,  97, -11,   1, 1},
+  {-1,   4, -15,  48, 100,  -9,   0, 1},
+  {-1,   4, -14,  44, 102,  -7,  -1, 1},
+  {-1,   4, -14,  40, 105,  -6,  -1, 1},
+  {-1,   4, -13,  36, 106,  -3,  -2, 1},
+  {-1,   4, -12,  32, 108,  -1,  -3, 1},
+  {-1,   3, -11,  28, 110,   1,  -4, 2},
+  {-1,   3, -10,  24, 111,   4,  -5, 2},
+  {-1,   3,  -9,  21, 111,   7,  -6, 2},
+  {-1,   3,  -9,  17, 112,  10,  -7, 3}
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -3, 128,   3,  -1,   0, 0},
+  {-1,   2,  -6, 127,   7,  -2,   1, 0},
+  {-1,   3,  -9, 126,  12,  -4,   1, 0},
+  {-1,   4, -12, 125,  16,  -5,   1, 0},
+  {-1,   4, -14, 123,  20,  -6,   2, 0},
+  {-1,   5, -15, 120,  25,  -8,   2, 0},
+  {-1,   5, -17, 118,  30,  -9,   3, -1},
+  {-1,   6, -18, 114,  35, -10,   3, -1},
+  {-1,   6, -19, 111,  41, -12,   3, -1},
+  {-1,   6, -20, 107,  46, -13,   4, -1},
+  {-1,   6, -21, 103,  52, -14,   4, -1},
+  {-1,   6, -21,  99,  57, -16,   5, -1},
+  {-1,   6, -21,  94,  63, -17,   5, -1},
+  {-1,   6, -20,  89,  68, -18,   5, -1},
+  {-1,   6, -20,  84,  73, -19,   6, -1},
+  {-1,   6, -20,  79,  79, -20,   6, -1},
+  {-1,   6, -19,  73,  84, -20,   6, -1},
+  {-1,   5, -18,  68,  89, -20,   6, -1},
+  {-1,   5, -17,  63,  94, -21,   6, -1},
+  {-1,   5, -16,  57,  99, -21,   6, -1},
+  {-1,   4, -14,  52, 103, -21,   6, -1},
+  {-1,   4, -13,  46, 107, -20,   6, -1},
+  {-1,   3, -12,  41, 111, -19,   6, -1},
+  {-1,   3, -10,  35, 114, -18,   6, -1},
+  {-1,   3,  -9,  30, 118, -17,   5, -1},
+  {0,   2,  -8,  25, 120, -15,   5, -1},
+  {0,   2,  -6,  20, 123, -14,   4, -1},
+  {0,   1,  -5,  16, 125, -12,   4, -1},
+  {0,   1,  -4,  12, 126,  -9,   3, -1},
+  {0,   1,  -2,   7, 127,  -6,   2, -1},
+  {0,   0,  -1,   3, 128,  -3,   1, 0}
+};
+
+// Filters for factor of 2 downsampling.
+static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1};
+static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3};
+
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+  // Ratio outlength / inlength in sixteenths picks the bank: 16, 13, 11, 9.
+  const int out16 = outlength * 16;
+  if (out16 >= inlength * 16)
+    return vp9_filteredinterp_filters1000;
+  if (out16 >= inlength * 13)
+    return vp9_filteredinterp_filters875;
+  if (out16 >= inlength * 11)
+    return vp9_filteredinterp_filters750;
+  if (out16 >= inlength * 9)
+    return vp9_filteredinterp_filters625;
+  return vp9_filteredinterp_filters500;
+}
+
+static void interpolate(const uint8_t *const input, int inlength,
+                        uint8_t *output, int outlength) {
+  const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
+      outlength;  // source step per output sample, 32 fraction bits, rounded
+  const int64_t offset = inlength > outlength ?
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint8_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;
+  // Resamples input[0..inlength) to outlength samples with INTERP_TAPS-tap kernels.
+  const interp_kernel *interp_filters =
+      choose_interp_filter(inlength, outlength);
+  // y = offset + x * delta: source position of output x, 32 fraction bits.
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;  // first output whose leftmost tap index (int_pel - 3) is >= 0
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;  // last output whose rightmost tap index (int_pel + 4) is < inlength
+  if (x1 > x2) {  // no clamp-free span: clamp tap indices at both ends
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+        sum += filter[k] * input[(pk < 0 ? 0 :
+                                  (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  } else {
+    // Initial part: outputs before x1, tap indices clamped at 0.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                                  0 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // Middle part: outputs x1..x2, every tap index in range, no clamping.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // End part: outputs after x2, tap indices clamped at inlength - 1.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = interp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  }
+}
+
+static void down2_symeven(const uint8_t *const input, int length,
+                          uint8_t *output) {
+  // Halves the signal; the symmetric filter has 2 * filter_len_half taps.
+  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint8_t *optr = output;
+  int l1 = filter_len_half;  // i < l1: left taps (i - j) may precede input
+  int l2 = (length - filter_len_half);  // i >= l2: right taps may pass the end
+  l1 += (l1 & 1);  // round both bounds up to even, since i advances by 2
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: clamp tap indices at both ends.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));  // rounding term for the shift below
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: only the left taps (i - j) need clamping.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: every tap index is inside the input.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: only the right taps (i + 1 + j) need clamping.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+static void down2_symodd(const uint8_t *const input, int length,
+                         uint8_t *output) {
+  // Halves the signal; symmetric filter of 2 * filter_len_half - 1 taps
+  // whose centre tap is filter[0].
+  const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint8_t *optr = output;
+  int l1 = filter_len_half - 1;  // i < l1: left taps (i - j) may precede input
+  int l2 = (length - filter_len_half + 1);  // i >= l2: right taps may pass end
+  l1 += (l1 & 1);  // round both bounds up to even, since i advances by 2
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: clamp tap indices at both ends.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: only the left taps (i - j) need clamping.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: every tap index is inside the input.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: only the right taps (i + j) need clamping.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+static int get_down2_length(int length, int steps) {
+  // Halves length `steps` times, rounding up at every step.
+  for (; steps > 0; --steps)
+    length = (length + 1) >> 1;
+  return length;
+}
+
+int get_down2_steps(int in_length, int out_length) {
+  // Returns how many times in_length can be halved (rounding up) while the
+  // result stays >= out_length. Stops once the length is 1, because halving 1
+  // yields 1 again and the loop would otherwise never end for out_length <= 1.
+  int steps = 0;
+  for (; in_length > 1 && get_down2_length(in_length, 1) >= out_length; ++steps)
+    in_length = get_down2_length(in_length, 1);
+  return steps;
+}
+
+// Resizes input[0..length) into output[0..olength): 2:1 decimation while the
+// halved length stays >= olength, then interpolate() covers what remains.
+// buf is scratch of at least length bytes; if NULL, scratch is allocated here.
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *buf) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(uint8_t) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    int s;
+    uint8_t *out = NULL;
+    uint8_t *tmpbuf = NULL;
+    uint8_t *otmp = buf;
+    uint8_t *otmp2;
+    int filteredlength = length;
+    if (otmp == NULL) {
+      // Allocate scratch only when the caller did not supply any.
+      tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length);
+      if (tmpbuf == NULL) return;  // nothing can be filtered without it
+      otmp = tmpbuf;
+    }
+    // Stages alternate between otmp and otmp2, so none overwrites its input.
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint8_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        down2_symodd(in, filteredlength, out);
+      else
+        down2_symeven(in, filteredlength, out);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength)
+      interpolate(out, filteredlength, output, olength);
+    free(tmpbuf);
+  } else {
+    interpolate(input, length, output, olength);
+  }
+}
+
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {  // Gathers a strided column of img into arr.
+  int i;
+  uint8_t *iptr = img;
+  uint8_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {  // Visit len bytes of img, `stride` bytes apart.
+    *aptr++ = *iptr;  // Store them contiguously in arr.
+  }
+}
+
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {  // Scatters arr into a strided column of img.
+  int i;
+  uint8_t *iptr = img;
+  uint8_t *aptr = arr;
+  for (i = 0; i < len; ++i, iptr += stride) {  // Write len bytes of img, `stride` bytes apart.
+    *iptr = *aptr++;  // Source bytes are read contiguously from arr.
+  }
+}
+
+// Resizes a height x width plane (rows in_stride bytes apart) into a
+// height2 x width2 plane (rows out_stride bytes apart) in two passes:
+// every row is first resized to width2 into an intermediate buffer, then
+// every column of that buffer is resized to height2 and written to output.
+// If any working buffer cannot be allocated the function returns without
+// writing to output.
+void vp9_resize_plane(const uint8_t *const input,
+                      int height,
+                      int width,
+                      int in_stride,
+                      uint8_t *output,
+                      int height2,
+                      int width2,
+                      int out_stride) {
+  int i;
+  // Horizontally resized image: height rows of width2 bytes.
+  uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
+  // Scratch space handed to resize_multistep for both passes.
+  uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
+                                      (width < height ? height : width));
+  // One input column (height bytes) followed by one output column (height2).
+  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+  if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL)
+    goto cleanup;  // Never dereference a failed allocation.
+  for (i = 0; i < height; ++i)
+    resize_multistep(input + in_stride * i, width,
+                        intbuf + width2 * i, width2, tmpbuf);
+  for (i = 0; i < width2; ++i) {
+    fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+    resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
+    fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+  }
+ cleanup:
+  // free(NULL) is a no-op, so this is safe after a partial allocation failure.
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+
+void vp9_resize_frame420(const uint8_t *const y,  // Resizes the y, u and v planes; u and v use half the width and half the height.
+                         int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  vp9_resize_plane(y, height, width, y_stride,  // y plane at the given dimensions.
+                   oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height / 2, width / 2, uv_stride,  // u plane: both dimensions halved (integer division).
+                   ou, oheight / 2, owidth / 2, ouv_stride);
+  vp9_resize_plane(v, height / 2, width / 2, uv_stride,  // v plane: same geometry and strides as u.
+                   ov, oheight / 2, owidth / 2, ouv_stride);
+}
+
+void vp9_resize_frame422(const uint8_t *const y, int y_stride,  // Resizes the y, u and v planes; u and v use half the width and the full height.
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  vp9_resize_plane(y, height, width, y_stride,  // y plane at the given dimensions.
+                   oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height, width / 2, uv_stride,  // u plane: width halved (integer division), height unchanged.
+                   ou, oheight, owidth / 2, ouv_stride);
+  vp9_resize_plane(v, height, width / 2, uv_stride,  // v plane: same geometry and strides as u.
+                   ov, oheight, owidth / 2, ouv_stride);
+}
+
+void vp9_resize_frame444(const uint8_t *const y, int y_stride,  // Resizes the y, u and v planes, all at the same dimensions.
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {
+  vp9_resize_plane(y, height, width, y_stride,  // y plane.
+                   oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height, width, uv_stride,  // u plane: same size as y, but with the uv strides.
+                   ou, oheight, owidth, ouv_stride);
+  vp9_resize_plane(v, height, width, uv_stride,  // v plane: same geometry and strides as u.
+                   ov, oheight, owidth, ouv_stride);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.h
new file mode 100644
index 00000000000..1818cd47efb
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.h
@@ -0,0 +1,68 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_RESIZE_H_
+#define VP9_ENCODER_VP9_RESIZE_H_
+
+#include <stdio.h>
+#include "vpx/vpx_integer.h"
+
+void vp9_resize_plane(const uint8_t *const input,  // Resizes one 8-bit plane; input is the source plane.
+                      int height,  // Source rows.
+                      int width,  // Source columns.
+                      int in_stride,  // Byte distance between source rows.
+                      uint8_t *output,  // Destination plane.
+                      int height2,  // Destination rows.
+                      int width2,  // Destination columns.
+                      int out_stride);  // Byte distance between destination rows.
+void vp9_resize_frame420(const uint8_t *const y,  // Resizes three planes; u and v are processed at height / 2 x width / 2.
+                         int y_stride,
+                         const uint8_t *const u,
+                         const uint8_t *const v,
+                         int uv_stride,  // Shared by the u and v source planes.
+                         int height,  // Source size of the y plane.
+                         int width,
+                         uint8_t *oy,
+                         int oy_stride,
+                         uint8_t *ou,
+                         uint8_t *ov,
+                         int ouv_stride,  // Shared by the u and v destination planes.
+                         int oheight,  // Destination size of the y plane.
+                         int owidth);
+void vp9_resize_frame422(const uint8_t *const y,  // Resizes three planes; u and v are processed at height x width / 2.
+                         int y_stride,
+                         const uint8_t *const u,
+                         const uint8_t *const v,
+                         int uv_stride,  // Shared by the u and v source planes.
+                         int height,  // Source size of the y plane.
+                         int width,
+                         uint8_t *oy,
+                         int oy_stride,
+                         uint8_t *ou,
+                         uint8_t *ov,
+                         int ouv_stride,  // Shared by the u and v destination planes.
+                         int oheight,  // Destination size of the y plane.
+                         int owidth);
+void vp9_resize_frame444(const uint8_t *const y,  // Resizes three planes; u and v are processed at the same size as y.
+                         int y_stride,
+                         const uint8_t *const u,
+                         const uint8_t *const v,
+                         int uv_stride,  // Shared by the u and v source planes.
+                         int height,  // Source size of every plane.
+                         int width,
+                         uint8_t *oy,
+                         int oy_stride,
+                         uint8_t *ou,
+                         uint8_t *ov,
+                         int ouv_stride,  // Shared by the u and v destination planes.
+                         int oheight,  // Destination size of every plane.
+                         int owidth);
+
+#endif    // VP9_ENCODER_VP9_RESIZE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c
new file mode 100644
index 00000000000..892e9055198
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_variance.h"
+
+static INLINE unsigned int sad(const uint8_t *a, int a_stride,  // Sum of absolute differences between two width x height byte blocks.
+                               const uint8_t *b, int b_stride,
+                               int width, int height) {
+  int y, x;
+  unsigned int sad = 0;  // Running total; shadows the function name inside this body.
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);  // Bytes promote to int, so the difference may be negative before abs().
+
+    a += a_stride;  // Advance each block to its next row.
+    b += b_stride;
+  }
+
+  return sad;
+}
+
+#define sadMxN(m, n) /* Defines vp9_sad<m>x<n>_c and vp9_sad<m>x<n>_avg_c for an m-wide, n-high block. */ \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *ref, int ref_stride, \
+                                  unsigned int max_sad) { /* max_sad is accepted but not used here. */ \
+  return sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+                                      const uint8_t *ref, int ref_stride, \
+                                      const uint8_t *second_pred, \
+                                      unsigned int max_sad) { /* max_sad is accepted but not used here. */ \
+  uint8_t comp_pred[m * n]; /* Filled by vp9_comp_avg_pred (defined elsewhere) from second_pred and ref. */ \
+  vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return sad(src, src_stride, comp_pred, m, m, n); /* comp_pred is read with a row stride of m. */ \
+}
+
+#define sadMxNxK(m, n, k) /* Defines vp9_sad<m>x<n>x<k>_c: k SADs against ref, ref + 1, ..., ref + k - 1. */ \
+void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                const uint8_t *ref, int ref_stride, \
+                                unsigned int *sads) { /* sads receives k results. */ \
+  int i; \
+  for (i = 0; i < k; ++i) \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride, \
+                                   0x7fffffff); /* Passed as max_sad, which the _c function ignores. */ \
+}
+
+#define sadMxNx4D(m, n) /* Defines vp9_sad<m>x<n>x4d_c: one SAD of src against each of refs[0..3]. */ \
+void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                             const uint8_t *const refs[], int ref_stride, \
+                             unsigned int *sads) { /* sads receives 4 results; all refs share ref_stride. */ \
+  int i; \
+  for (i = 0; i < 4; ++i) \
+    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride, \
+                                   0x7fffffff); /* Passed as max_sad, which the _c function ignores. */ \
+}
+
+// 64x64
+sadMxN(64, 64)
+sadMxNxK(64, 64, 3)
+sadMxNxK(64, 64, 8)
+sadMxNx4D(64, 64)
+
+// 64x32
+sadMxN(64, 32)
+sadMxNx4D(64, 32)
+
+// 32x64
+sadMxN(32, 64)
+sadMxNx4D(32, 64)
+
+// 32x32
+sadMxN(32, 32)
+sadMxNxK(32, 32, 3)
+sadMxNxK(32, 32, 8)
+sadMxNx4D(32, 32)
+
+// 32x16
+sadMxN(32, 16)
+sadMxNx4D(32, 16)
+
+// 16x32
+sadMxN(16, 32)
+sadMxNx4D(16, 32)
+
+// 16x16
+sadMxN(16, 16)
+sadMxNxK(16, 16, 3)
+sadMxNxK(16, 16, 8)
+sadMxNx4D(16, 16)
+
+// 16x8
+sadMxN(16, 8)
+sadMxNxK(16, 8, 3)
+sadMxNxK(16, 8, 8)
+sadMxNx4D(16, 8)
+
+// 8x16
+sadMxN(8, 16)
+sadMxNxK(8, 16, 3)
+sadMxNxK(8, 16, 8)
+sadMxNx4D(8, 16)
+
+// 8x8
+sadMxN(8, 8)
+sadMxNxK(8, 8, 3)
+sadMxNxK(8, 8, 8)
+sadMxNx4D(8, 8)
+
+// 8x4
+sadMxN(8, 4)
+sadMxNxK(8, 4, 8)
+sadMxNx4D(8, 4)
+
+// 4x8
+sadMxN(4, 8)
+sadMxNxK(4, 8, 8)
+sadMxNx4D(4, 8)
+
+// 4x4
+sadMxN(4, 4)
+sadMxNxK(4, 4, 3)
+sadMxNxK(4, 4, 8)
+sadMxNx4D(4, 4)
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad_c.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad_c.c
deleted file mode 100644
index 42ddb21a51b..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad_c.c
+++ /dev/null
@@ -1,615 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "vp9/common/vp9_sadmxn.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "./vp9_rtcd.h"
-
-#define sad_mxn_func(m, n) \
-unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
-                                  int  src_stride, \
-                                  const uint8_t *ref_ptr, \
-                                  int  ref_stride, \
-                                  unsigned int max_sad) { \
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
-} \
-unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
-                                      int  src_stride, \
-                                      const uint8_t *ref_ptr, \
-                                      int  ref_stride, \
-                                      const uint8_t *second_pred, \
-                                      unsigned int max_sad) { \
-  uint8_t comp_pred[m * n]; \
-  comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
-  return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
-}
-
-sad_mxn_func(64, 64)
-sad_mxn_func(64, 32)
-sad_mxn_func(32, 64)
-sad_mxn_func(32, 32)
-sad_mxn_func(32, 16)
-sad_mxn_func(16, 32)
-sad_mxn_func(16, 16)
-sad_mxn_func(16, 8)
-sad_mxn_func(8, 16)
-sad_mxn_func(8, 8)
-sad_mxn_func(8, 4)
-sad_mxn_func(4, 8)
-sad_mxn_func(4, 4)
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-}
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 3, ref_stride,
-                              0x7fffffff);
-  sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 4, ref_stride,
-                              0x7fffffff);
-  sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 5, ref_stride,
-                              0x7fffffff);
-  sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 6, ref_stride,
-                              0x7fffffff);
-  sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 7, ref_stride,
-                              0x7fffffff);
-}
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 3, ref_stride,
-                              0x7fffffff);
-  sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 4, ref_stride,
-                              0x7fffffff);
-  sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 5, ref_stride,
-                              0x7fffffff);
-  sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 6, ref_stride,
-                              0x7fffffff);
-  sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 7, ref_stride,
-                              0x7fffffff);
-}
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
-                      uint32_t *sad_array) {
-  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 3, ref_stride,
-                              0x7fffffff);
-  sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 4, ref_stride,
-                              0x7fffffff);
-  sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 5, ref_stride,
-                              0x7fffffff);
-  sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 6, ref_stride,
-                              0x7fffffff);
-  sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 7, ref_stride,
-                              0x7fffffff);
-}
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
-}
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
-                    unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
-                    uint32_t *sad_array) {
-  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr, ref_stride,
-                            0x7fffffff);
-  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride,
-                            0x7fffffff);
-  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride,
-                            0x7fffffff);
-  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 3, ref_stride,
-                            0x7fffffff);
-  sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 4, ref_stride,
-                            0x7fffffff);
-  sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 5, ref_stride,
-                            0x7fffffff);
-  sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 6, ref_stride,
-                            0x7fffffff);
-  sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 7, ref_stride,
-                            0x7fffffff);
-}
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
-}
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
-                    unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
-                    uint32_t *sad_array) {
-  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr, ref_stride,
-                            0x7fffffff);
-  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride,
-                            0x7fffffff);
-  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride,
-                            0x7fffffff);
-  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 3, ref_stride,
-                            0x7fffffff);
-  sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 4, ref_stride,
-                            0x7fffffff);
-  sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 5, ref_stride,
-                            0x7fffffff);
-  sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 6, ref_stride,
-                            0x7fffffff);
-  sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 7, ref_stride,
-                            0x7fffffff);
-}
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t* const ref_ptr[],
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t* const ref_ptr[],
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
-}
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
-}
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
index 24f011f8309..574df6293e1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
@@ -10,46 +10,28 @@
 
 
 #include <limits.h>
+
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_segmentation.h"
+
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
-void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg =  &cpi->common.seg;
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_segmentation.h"
 
+void vp9_enable_segmentation(struct segmentation *seg) {
   seg->enabled = 1;
   seg->update_map = 1;
   seg->update_data = 1;
 }
 
-void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg =  &cpi->common.seg;
+void vp9_disable_segmentation(struct segmentation *seg) {
   seg->enabled = 0;
 }
 
-void vp9_set_segmentation_map(VP9_PTR ptr,
-                              unsigned char *segmentation_map) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg = &cpi->common.seg;
-
-  // Copy in the new segmentation map
-  vpx_memcpy(cpi->segmentation_map, segmentation_map,
-             (cpi->common.mi_rows * cpi->common.mi_cols));
-
-  // Signal that the map should be updated.
-  seg->update_map = 1;
-  seg->update_data = 1;
-}
-
-void vp9_set_segment_data(VP9_PTR ptr,
+void vp9_set_segment_data(struct segmentation *seg,
                           signed char *feature_data,
                           unsigned char abs_delta) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg = &cpi->common.seg;
-
   seg->abs_delta = abs_delta;
 
   vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
@@ -58,6 +40,15 @@ void vp9_set_segment_data(VP9_PTR ptr,
   // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
   //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
 }
+void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+void vp9_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;
+}
 
 // Based on set of segment counts calculate a probability tree
 static void calc_segtree_probs(int *segcounts, vp9_prob *segment_tree_probs) {
@@ -118,7 +109,7 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
 }
 
 static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
-                       MODE_INFO **mi_8x8,
+                       MODE_INFO **mi,
                        int *no_pred_segcounts,
                        int (*temporal_predictor_count)[2],
                        int *t_unpred_seg_counts,
@@ -130,8 +121,8 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  xd->mi_8x8 = mi_8x8;
-  segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+  xd->mi = mi;
+  segment_id = xd->mi[0]->mbmi.segment_id;
 
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
@@ -140,7 +131,7 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
 
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
-    const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
     // Test to see if the segment id matches the predicted value.
     const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
                                                    bsize, mi_row, mi_col);
@@ -149,46 +140,46 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
 
     // Store the prediction status for this mb and update counts
     // as appropriate
-    vp9_set_pred_flag_seg_id(xd, pred_flag);
+    xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
     temporal_predictor_count[pred_context][pred_flag]++;
 
+    // Update the "unpredicted" segment count
     if (!pred_flag)
-      // Update the "unpredicted" segment count
       t_unpred_seg_counts[segment_id]++;
   }
 }
 
 static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
-                          MODE_INFO **mi_8x8,
+                          MODE_INFO **mi,
                           int *no_pred_segcounts,
                           int (*temporal_predictor_count)[2],
                           int *t_unpred_seg_counts,
                           int mi_row, int mi_col,
                           BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+  const int mis = cm->mi_stride;
   int bw, bh;
   const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
-  bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
+  bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
+  bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
 
   if (bw == bs && bh == bs) {
-    count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+    count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count,
                t_unpred_seg_counts, bs, bs, mi_row, mi_col);
   } else if (bw == bs && bh < bs) {
-    count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+    count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count,
                t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
-    count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts,
+    count_segs(cpi, tile, mi + hbs * mis, no_pred_segcounts,
                temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
                mi_row + hbs, mi_col);
   } else if (bw < bs && bh == bs) {
-    count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+    count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count,
                t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
-    count_segs(cpi, tile, mi_8x8 + hbs,
+    count_segs(cpi, tile, mi + hbs,
                no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
                hbs, bs, mi_row, mi_col + hbs);
   } else {
@@ -201,7 +192,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
       const int mi_dc = hbs * (n & 1);
       const int mi_dr = hbs * (n >> 1);
 
-      count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc],
+      count_segs_sb(cpi, tile, &mi[mi_dr * mis + mi_dc],
                     no_pred_segcounts, temporal_predictor_count,
                     t_unpred_seg_counts,
                     mi_row + mi_dr, mi_col + mi_dc, subsize);
@@ -226,9 +217,6 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   vp9_prob t_pred_tree[SEG_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
-  const int mis = cm->mode_info_stride;
-  MODE_INFO **mi_ptr, **mi;
-
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
   vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
@@ -238,12 +226,13 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   // predicts this one
   for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
     TileInfo tile;
-
+    MODE_INFO **mi_ptr;
     vp9_tile_init(&tile, cm, 0, tile_col);
+
     mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
     for (mi_row = 0; mi_row < cm->mi_rows;
-         mi_row += 8, mi_ptr += 8 * mis) {
-      mi = mi_ptr;
+         mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
+      MODE_INFO **mi = mi_ptr;
       for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
            mi_col += 8, mi += 8)
         count_segs_sb(cpi, &tile, mi, no_pred_segcounts,
@@ -287,3 +276,12 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
     vpx_memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
   }
 }
+
+void vp9_reset_segment_features(struct segmentation *seg) {
+  // Set up default state for MB feature flags
+  seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
+  vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+  vp9_clearall_segfeatures(seg);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.h
index 2183771c459..50dd562c805 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.h
@@ -13,14 +13,21 @@
 #define VP9_ENCODER_VP9_SEGMENTATION_H_
 
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_encoder.h"
 
-void vp9_enable_segmentation(VP9_PTR ptr);
-void vp9_disable_segmentation(VP9_PTR ptr);
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-// Valid values for a segment are 0 to 3
-// Segmentation map is arrange as [Rows][Columns]
-void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);
+void vp9_enable_segmentation(struct segmentation *seg);
+void vp9_disable_segmentation(struct segmentation *seg);
+
+void vp9_disable_segfeature(struct segmentation *seg,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id);
+void vp9_clear_segdata(struct segmentation *seg,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id);
 
 // The values given for each segment can be either deltas (from the default
 // value chosen for the frame) or absolute values.
@@ -32,9 +39,15 @@ void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);
 //
 // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
 // the absolute values given).
-void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
                           unsigned char abs_delta);
 
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
 
+void vp9_reset_segment_features(struct segmentation *seg);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_SEGMENTATION_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
new file mode 100644
index 00000000000..93e23eee282
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -0,0 +1,397 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_speed_features.h"
+
+enum {
+  ALL_INTRA_MODES = (1 << DC_PRED) |
+                    (1 << V_PRED) | (1 << H_PRED) |
+                    (1 << D45_PRED) | (1 << D135_PRED) |
+                    (1 << D117_PRED) | (1 << D153_PRED) |
+                    (1 << D207_PRED) | (1 << D63_PRED) |
+                    (1 << TM_PRED),
+
+  INTRA_DC_ONLY   = (1 << DC_PRED),
+
+  INTRA_DC_TM     = (1 << TM_PRED) | (1 << DC_PRED),
+
+  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+
+  INTRA_DC_TM_H_V = INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)
+};
+
+enum {
+  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD) |
+                              (1 << THR_LAST),
+
+  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD)
+};
+
+static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
+                                   SPEED_FEATURES *sf, int speed) {
+  sf->adaptive_rd_thresh = 1;
+  sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
+  sf->allow_skip_recode = 1;
+
+  if (speed >= 1) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check  = 1;
+    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
+                                                      : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->subpel_iters_per_step = 1;
+    sf->mode_skip_start = 10;
+    sf->adaptive_pred_interp_filter = 1;
+
+    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 2) {
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+    sf->adaptive_pred_interp_filter = 2;
+    sf->reference_masking = 1;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+  }
+
+  if (speed >= 3) {
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
+    sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+    sf->adaptive_rd_thresh = 3;
+    sf->mode_skip_start = 6;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->use_fast_coef_costing = 1;
+  }
+
+  if (speed >= 4) {
+    sf->use_square_partition_only = 1;
+    sf->tx_size_search_method = USE_LARGESTALL;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH |
+                                  FLAG_EARLY_TERMINATE;
+    sf->disable_filter_search_var_thresh = 200;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->use_lp32x32fdct = 1;
+  }
+
+  if (speed >= 5) {
+    int i;
+
+    sf->partition_search_type = FIXED_PARTITION;
+    sf->optimize_coefficients = 0;
+    sf->search_method = HEX;
+    sf->disable_filter_search_var_thresh = 500;
+    for (i = 0; i < TX_SIZES; ++i) {
+      sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+    }
+    cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+  }
+}
+
+static void set_rt_speed_feature(VP9_COMMON *cm, SPEED_FEATURES *sf,
+                                 int speed) {
+  sf->static_segmentation = 0;
+  sf->adaptive_rd_thresh = 1;
+  sf->encode_breakout_thresh = 1;
+  sf->use_fast_coef_costing = 1;
+
+  if (speed == 1) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+                                              : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_interp_filter = 1;
+    sf->auto_mv_step_size = 1;
+    sf->adaptive_rd_thresh = 2;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->encode_breakout_thresh = 8;
+  }
+
+  if (speed >= 2) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    sf->less_rectangular_check = 1;
+    sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
+                                                        : USE_LARGESTALL;
+    if (MIN(cm->width, cm->height) >= 720)
+      sf->disable_split_mask = cm->show_frame ?
+        DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+    else
+      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+                                 FLAG_SKIP_INTRA_BESTINTER |
+                                 FLAG_SKIP_COMP_BESTINTRA |
+                                 FLAG_SKIP_INTRA_LOWVAR;
+    sf->use_rd_breakout = 1;
+    sf->adaptive_motion_search = 1;
+    sf->adaptive_pred_interp_filter = 2;
+    sf->auto_mv_step_size = 1;
+    sf->reference_masking = 1;
+
+    sf->disable_filter_search_var_thresh = 50;
+    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+    sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+    sf->adjust_partitioning_from_last_frame = 1;
+    sf->last_partitioning_redo_frequency = 3;
+
+    sf->adaptive_rd_thresh = 2;
+    sf->use_lp32x32fdct = 1;
+    sf->mode_skip_start = 11;
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    sf->encode_breakout_thresh = 200;
+  }
+
+  if (speed >= 3) {
+    sf->use_square_partition_only = 1;
+    sf->disable_filter_search_var_thresh = 100;
+    sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+    sf->constrain_copy_partition = 1;
+    sf->use_uv_intra_rd_estimate = 1;
+    sf->skip_encode_sb = 1;
+    sf->subpel_iters_per_step = 1;
+    sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+    sf->adaptive_rd_thresh = 4;
+    sf->mode_skip_start = 6;
+    sf->allow_skip_recode = 0;
+    sf->optimize_coefficients = 0;
+    sf->disable_split_mask = DISABLE_ALL_SPLIT;
+    sf->lpf_pick = LPF_PICK_FROM_Q;
+    sf->encode_breakout_thresh = 700;
+  }
+
+  if (speed >= 4) {
+    int i;
+    sf->last_partitioning_redo_frequency = 4;
+    sf->adaptive_rd_thresh = 5;
+    sf->use_fast_coef_costing = 0;
+    sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
+    sf->adjust_partitioning_from_last_frame =
+        cm->last_frame_type != cm->frame_type || (0 ==
+        (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency);
+    sf->subpel_force_stop = 1;
+    for (i = 0; i < TX_SIZES; i++) {
+      sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+      sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+    }
+    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY;
+    sf->frame_parameter_update = 0;
+    sf->encode_breakout_thresh = 1000;
+    sf->search_method = FAST_HEX;
+    sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
+    sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+    sf->max_intra_bsize = BLOCK_32X32;
+    sf->allow_skip_recode = 1;
+  }
+
+  if (speed >= 5) {
+    sf->max_partition_size = BLOCK_32X32;
+    sf->min_partition_size = BLOCK_8X8;
+    sf->partition_check =
+        (cm->current_video_frame % sf->last_partitioning_redo_frequency == 1);
+    sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
+        (cm->current_video_frame %
+            (sf->last_partitioning_redo_frequency << 1) == 1);
+    sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+    sf->partition_search_type = REFERENCE_PARTITION;
+    sf->use_nonrd_pick_mode = 1;
+    sf->search_method = FAST_DIAMOND;
+    sf->allow_skip_recode = 0;
+    sf->chessboard_index = cm->current_video_frame & 0x01;
+  }
+
+  if (speed >= 6) {
+    // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
+    sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
+    sf->search_type_check_frequency = 50;
+    sf->source_var_thresh = 360;
+  }
+
+  if (speed >= 7) {
+    int i;
+    for (i = 0; i < BLOCK_SIZES; ++i)
+      sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV));
+  }
+}
+
+void vp9_set_speed_features(VP9_COMP *cpi) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  int i;
+
+  // best quality defaults
+  sf->frame_parameter_update = 1;
+  sf->search_method = NSTEP;
+  sf->recode_loop = ALLOW_RECODE;
+  sf->subpel_search_method = SUBPEL_TREE;
+  sf->subpel_iters_per_step = 2;
+  sf->subpel_force_stop = 0;
+  sf->optimize_coefficients = !oxcf->lossless;
+  sf->reduce_first_step_size = 0;
+  sf->auto_mv_step_size = 0;
+  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  sf->adaptive_rd_thresh = 0;
+  sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
+  sf->tx_size_search_method = USE_FULL_RD;
+  sf->use_lp32x32fdct = 0;
+  sf->adaptive_motion_search = 0;
+  sf->adaptive_pred_interp_filter = 0;
+  sf->reference_masking = 0;
+  sf->partition_search_type = SEARCH_PARTITION;
+  sf->less_rectangular_check = 0;
+  sf->use_square_partition_only = 0;
+  sf->auto_min_max_partition_size = NOT_IN_USE;
+  sf->max_partition_size = BLOCK_64X64;
+  sf->min_partition_size = BLOCK_4X4;
+  sf->adjust_partitioning_from_last_frame = 0;
+  sf->last_partitioning_redo_frequency = 4;
+  sf->constrain_copy_partition = 0;
+  sf->disable_split_mask = 0;
+  sf->mode_search_skip_flags = 0;
+  sf->force_frame_boost = 0;
+  sf->max_delta_qindex = 0;
+  sf->disable_split_var_thresh = 0;
+  sf->disable_filter_search_var_thresh = 0;
+  for (i = 0; i < TX_SIZES; i++) {
+    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
+    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+  }
+  sf->use_rd_breakout = 0;
+  sf->skip_encode_sb = 0;
+  sf->use_uv_intra_rd_estimate = 0;
+  sf->allow_skip_recode = 0;
+  sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+  sf->use_fast_coef_updates = TWO_LOOP;
+  sf->use_fast_coef_costing = 0;
+  sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
+  sf->use_nonrd_pick_mode = 0;
+  sf->encode_breakout_thresh = 0;
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    sf->disable_inter_mode_mask[i] = 0;
+  sf->max_intra_bsize = BLOCK_64X64;
+  // This setting only takes effect when partition_search_type is set
+  // to FIXED_PARTITION.
+  sf->always_this_block_size = BLOCK_16X16;
+  sf->search_type_check_frequency = 50;
+  sf->source_var_thresh = 100;
+
+  // Recode loop tolerance %.
+  sf->recode_tolerance = 25;
+
+  switch (oxcf->mode) {
+    case ONE_PASS_BEST:
+    case TWO_PASS_SECOND_BEST:  // This is the best quality mode.
+      cpi->diamond_search_sad = vp9_full_range_search;
+      break;
+    case TWO_PASS_FIRST:
+    case ONE_PASS_GOOD:
+    case TWO_PASS_SECOND_GOOD:
+      set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+      break;
+    case REALTIME:
+      set_rt_speed_feature(cm, sf, oxcf->speed);
+      break;
+  }
+
+  // Slow quant, dct and trellis not worthwhile for first pass
+  // so make sure they are always turned off.
+  if (cpi->pass == 1)
+    sf->optimize_coefficients = 0;
+
+  // No recode for 1 pass.
+  if (cpi->pass == 0) {
+    sf->recode_loop = DISALLOW_RECODE;
+    sf->optimize_coefficients = 0;
+  }
+
+  if (sf->subpel_search_method == SUBPEL_TREE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
+  }
+
+  cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1;
+
+  if (cpi->encode_breakout && oxcf->mode == REALTIME &&
+      sf->encode_breakout_thresh > cpi->encode_breakout)
+    cpi->encode_breakout = sf->encode_breakout_thresh;
+
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
+    sf->adaptive_pred_interp_filter = 0;
+
+  if (!cpi->oxcf.frame_periodic_boost) {
+    sf->max_delta_qindex = 0;
+  }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
new file mode 100644
index 00000000000..46806c9a9fb
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -0,0 +1,362 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#define VP9_ENCODER_VP9_SPEED_FEATURES_H_
+
+#include "vp9/common/vp9_enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2,
+  BIGDIA = 3,
+  SQUARE = 4,
+  FAST_HEX = 5,
+  FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {
+  // No recode.
+  DISALLOW_RECODE = 0,
+  // Allow recode for KF and exceeding maximum frame bandwidth.
+  ALLOW_RECODE_KFMAXBW = 1,
+  // Allow recode only for KF/ARF/GF frames.
+  ALLOW_RECODE_KFARFGF = 2,
+  // Allow recode for all frames based on bitrate constraints.
+  ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+  SUBPEL_TREE = 0,
+  // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+  LAST_FRAME_PARTITION_OFF = 0,
+  LAST_FRAME_PARTITION_LOW_MOTION = 1,
+  LAST_FRAME_PARTITION_ALL = 2
+} LAST_FRAME_PARTITION_METHOD;
+
+typedef enum {
+  USE_FULL_RD = 0,
+  USE_LARGESTINTRA,
+  USE_LARGESTINTRA_MODELINTER,
+  USE_LARGESTALL
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+  NOT_IN_USE = 0,
+  RELAXED_NEIGHBORING_MIN_MAX = 1,
+  STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+  // Try the full image with different values.
+  LPF_PICK_FROM_FULL_IMAGE,
+  // Try a small portion of the image with different values.
+  LPF_PICK_FROM_SUBIMAGE,
+  // Estimate the level based on quantizer and frame type
+  LPF_PICK_FROM_Q,
+} LPF_PICK_METHOD;
+
+typedef enum {
+  // Terminate search early based on distortion so far compared to
+  // qp step, distortion in the neighborhood of the frame, etc.
+  FLAG_EARLY_TERMINATE = 1 << 0,
+
+  // Skips comp inter modes if the best so far is an intra mode.
+  FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+  // Skips comp inter modes if the best single intermode so far does
+  // not have the same reference as one of the two references being
+  // tested.
+  FLAG_SKIP_COMP_REFMISMATCH = 1 << 2,
+
+  // Skips oblique intra modes if the best so far is an inter mode.
+  FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+  // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+  // intra so far is not one of the neighboring directions.
+  FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+  // Skips intra modes other than DC_PRED if the source variance is small
+  FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {
+  // Search partitions using RD/NONRD criterion
+  SEARCH_PARTITION = 0,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION = 1,
+
+  // Use a fixed size partition in every 64X64 SB, where the size is
+  // determined based on source variance
+  VAR_BASED_FIXED_PARTITION = 2,
+
+  REFERENCE_PARTITION = 3,
+
+  // Use an arbitrary partitioning scheme based on source variance within
+  // a 64X64 SB
+  VAR_BASED_PARTITION,
+
+  // Use non-fixed partitions based on source variance
+  SOURCE_VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {
+  // Does a dry run to see if any of the contexts need to be updated or not,
+  // before the final run.
+  TWO_LOOP = 0,
+
+  // No dry run conducted.
+  ONE_LOOP = 1,
+
+  // No dry run, also only half the coef contexts and bands are updated.
+  // The rest are not updated at all.
+  ONE_LOOP_REDUCED = 2
+} FAST_COEFF_UPDATE;
+
+typedef struct SPEED_FEATURES {
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+  SEARCH_METHODS search_method;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // Subpel_search_method can only be subpel_tree which does a subpixel
+  // logarithmic search that keeps stepping at 1/2 pixel units until
+  // you stop getting a gain, and then goes on to 1/4 and repeats
+  // the same process. Along the way it skips many diagonals.
+  SUBPEL_SEARCH_METHODS subpel_search_method;
+
+  // Maximum number of steps in logarithmic subpel search before giving up.
+  int subpel_iters_per_step;
+
+  // Control when to stop subpel search
+  int subpel_force_stop;
+
+  // This parameter controls the number of steps we'll do in a diamond
+  // search.
+  int max_step_search_steps;
+
+  // This parameter controls which step in the n-step process we start at.
+  // It's changed adaptively based on circumstances.
+  int reduce_first_step_size;
+
+  // If this is set to 1, we limit the motion search range to 2 times the
+  // largest motion vector found in the last frame.
+  int auto_mv_step_size;
+
+  // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+  int optimize_coefficients;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // If 1 we iterate finding a best reference for 2 ref frames together - via
+  // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
+  BLOCK_SIZE comp_inter_joint_search_thresh;
+
+  // This variable is used to cap the maximum number of times we skip testing a
+  // mode to be evaluated. A high value means we will be faster.
+  int adaptive_rd_thresh;
+
+  // Enables skipping the reconstruction step (idct, recon) in the
+  // intermediate steps assuming the last frame didn't have too many intra
+  // blocks and the q is less than a threshold.
+  int skip_encode_sb;
+  int skip_encode_frame;
+  // Speed feature to allow or disallow skipping of recode at block
+  // level within a frame.
+  int allow_skip_recode;
+
+  // This variable allows us to reuse the last frame's partition choices
+  // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
+  // frame as a starting point in low motion scenes or always use it. If set
+  // we use last_partitioning_redo_frequency to determine how often to redo
+  // the partitioning from scratch. adjust_partitioning_from_last_frame
+  // enables us to adjust up or down one partitioning from the last frame's
+  // partitioning.
+  LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+
+  // Determine which method we use to determine transform size. We can choose
+  // between options like full rd, largest for prediction size, largest
+  // for intra and model coefs for the rest.
+  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+  // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+  // precise but significantly faster than the non lp version.
+  int use_lp32x32fdct;
+
+  // TODO(JBB): remove this as it's no longer used.
+
+  // After looking at the first set of modes (set by index here), skip
+  // checking modes for reference frames that don't match the reference frame
+  // of the best so far.
+  int mode_skip_start;
+
+  // TODO(JBB): Remove this.
+  int reference_masking;
+
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
+  int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
+  int use_square_partition_only;
+
+  // Sets min and max partition sizes for this 64x64 region based on the
+  // same 64x64 in last encoded frame, and the left and above neighbor.
+  AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+
+  // Min and max partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE min_partition_size;
+  BLOCK_SIZE max_partition_size;
+
+  // Whether or not we allow partitions one smaller or one greater than the last
+  // frame's partitioning. Only used if use_lastframe_partitioning is set.
+  int adjust_partitioning_from_last_frame;
+
+  // How frequently we redo the partitioning from scratch. Only used if
+  // use_lastframe_partitioning is set.
+  int last_partitioning_redo_frequency;
+
+  // This enables constrained copy partitioning, which, given an input block
+  // size bsize, will copy previous partition for partitions less than bsize,
+  // otherwise bsize partition is used. bsize is currently set to 16x16.
+  // Used for the case where motion is detected in superblock.
+  int constrain_copy_partition;
+
+  // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+  // it always, to allow it for only Last frame and Intra, disable it for all
+  // inter modes or to enable it always.
+  int disable_split_mask;
+
+  // TODO(jingning): combine the related motion search speed features
+  // This allows us to use motion search at other sizes as a starting
+  // point for this motion search and limits the search range around it.
+  int adaptive_motion_search;
+
+  // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re check all the filters for
+  // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+  // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+  int adaptive_pred_interp_filter;
+
+  // Search through variable block partition types in non-RD mode decision
+  // encoding process for RTC.
+  int partition_check;
+
+  // Chessboard pattern index
+  int chessboard_index;
+
+  // Use finer quantizer in every other few frames that run variable block
+  // partition type search.
+  int force_frame_boost;
+
+  // Maximally allowed base quantization index fluctuation.
+  int max_delta_qindex;
+
+  // Implements various heuristics to skip searching modes
+  // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_LOGIC enum
+  unsigned int mode_search_skip_flags;
+
+  // A source variance threshold below which the split mode is disabled
+  unsigned int disable_split_var_thresh;
+
+  // A source variance threshold below which filter search is disabled
+  // Choose a very large value (UINT_MAX) to use 8-tap always
+  unsigned int disable_filter_search_var_thresh;
+
+  // These bit masks allow you to enable or disable intra modes for each
+  // transform size separately.
+  int intra_y_mode_mask[TX_SIZES];
+  int intra_uv_mode_mask[TX_SIZES];
+
+  // This variable enables an early break out of mode testing if the model for
+  // rd built from the prediction signal indicates a value that's much
+  // higher than the best rd we've seen so far.
+  int use_rd_breakout;
+
+  // This enables us to use an estimate for intra rd based on dc mode rather
+  // than choosing an actual uv mode in the stage of encoding before the actual
+  // final encode.
+  int use_uv_intra_rd_estimate;
+
+  // This feature controls how the loop filter level is determined.
+  LPF_PICK_METHOD lpf_pick;
+
+  // This feature limits the number of coefficient updates we actually do
+  // by only looking at counts from 1/2 the bands.
+  FAST_COEFF_UPDATE use_fast_coef_updates;
+
+  // This flag controls the use of non-RD mode decision.
+  int use_nonrd_pick_mode;
+
+  // This variable sets the encode_breakout threshold. Currently, it is only
+  // enabled in real time mode.
+  int encode_breakout_thresh;
+
+  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
+  int disable_inter_mode_mask[BLOCK_SIZES];
+
+  // This feature controls whether we do the expensive context update and
+  // calculation in the rd coefficient costing loop.
+  int use_fast_coef_costing;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // This variable controls the maximum block size where intra blocks can be
+  // used in inter frames.
+  // TODO(aconverse): Fold this into one of the other many mode skips
+  BLOCK_SIZE max_intra_bsize;
+
+  // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+  // FIXED_PARTITION search type should be used.
+  int search_type_check_frequency;
+
+  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+  unsigned int source_var_thresh;
+} SPEED_FEATURES;
+
+struct VP9_COMP;
+
+void vp9_set_speed_features(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
+
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
index a5f18e6313b..026e6a8fd9d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
@@ -8,8 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp9_rtcd.h"
 
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_ssim.h"
 
 void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
                             int rp, unsigned long *sum_s, unsigned long *sum_r,
@@ -65,12 +66,6 @@ static double similarity(unsigned long sum_s, unsigned long sum_r,
   return ssim_n * 1.0 / ssim_d;
 }
 
-static double ssim_16x16(uint8_t *s, int sp, uint8_t *r, int rp) {
-  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                       &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
-}
 static double ssim_8x8(uint8_t *s, int sp, uint8_t *r, int rp) {
   unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
   vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
new file mode 100644
index 00000000000..a581c2c23d4
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SSIM_H_
+#define VP9_ENCODER_VP9_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx_scale/yv12config.h"
+
+double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                     int lumamask, double *weight);
+
+double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SSIM_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
index eb864d96cb5..9796d647624 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
@@ -11,34 +11,13 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
 
-#include "vp9/encoder/vp9_boolhuff.h"
-#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_writer.h"
 
-#define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
 static int update_bits[255];
 
-static int count_uniform(int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return 0;
-  m = (1 << l) - n;
-  if (v < m)
-    return l - 1;
-  else
-    return l;
-}
-
-static int split_index(int i, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (i % modulus == modulus / 2)
-    i = i / modulus;
-  else
-    i = max1 + i - (i + modulus - modulus / 2) / modulus;
-  return i;
-}
-
 static int recenter_nonneg(int v, int m) {
   if (v > (m << 1))
     return v;
@@ -82,29 +61,16 @@ static int remap_prob(int v, int m) {
   return i;
 }
 
-static int count_term_subexp(int word, int k, int num_syms) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      count += count_uniform(word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
+static int count_term_subexp(int word) {
+  if (word < 16)
+    return 5;
+  if (word < 32)
+    return 6;
+  if (word < 64)
+    return 8;
+  if (word < 129)
+    return 10;
+  return 11;
 }
 
 static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
@@ -112,12 +78,9 @@ static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
   return update_bits[delp] * 256;
 }
 
-static void encode_uniform(vp9_writer *w, int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0)
-    return;
-  m = (1 << l) - n;
+static void encode_uniform(vp9_writer *w, int v) {
+  const int l = 8;
+  const int m = (1 << l) - 191;
   if (v < m) {
     vp9_write_literal(w, v, l - 1);
   } else {
@@ -126,38 +89,32 @@ static void encode_uniform(vp9_writer *w, int v, int n) {
   }
 }
 
-static void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      encode_uniform(w, word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      vp9_write_literal(w, t, 1);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        vp9_write_literal(w, word - mk, b);
-        break;
-      }
-    }
+static INLINE int write_bit_gte(vp9_writer *w, int word, int test) {
+  vp9_write_literal(w, word >= test, 1);
+  return word >= test;
+}
+
+static void encode_term_subexp(vp9_writer *w, int word) {
+  if (!write_bit_gte(w, word, 16)) {
+    vp9_write_literal(w, word, 4);
+  } else if (!write_bit_gte(w, word, 32)) {
+    vp9_write_literal(w, word - 16, 4);
+  } else if (!write_bit_gte(w, word, 64)) {
+    vp9_write_literal(w, word - 32, 5);
+  } else {
+    encode_uniform(w, word - 64);
   }
 }
 
 void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) {
   const int delp = remap_prob(newp, oldp);
-  encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
+  encode_term_subexp(w, delp);
 }
 
 void vp9_compute_update_table() {
   int i;
   for (i = 0; i < 254; i++)
-    update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
+    update_bits[i] = count_term_subexp(i);
 }
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
@@ -184,8 +141,7 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const vp9_prob *oldp,
                                               vp9_prob *bestp,
-                                              vp9_prob upd,
-                                              int b, int r) {
+                                              vp9_prob upd) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
   vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
@@ -221,7 +177,7 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
 }
 
 void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
-                               unsigned int *ct) {
+                               const unsigned int ct[2]) {
   const vp9_prob upd = DIFF_UPDATE_PROB;
   vp9_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
index 521c7778d3a..8e9c0c62acd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
@@ -9,8 +9,12 @@
  */
 
 
-#ifndef VP9_DECODER_VP9_SUBEXP_H_
-#define VP9_DECODER_VP9_SUBEXP_H_
+#ifndef VP9_ENCODER_VP9_SUBEXP_H_
+#define VP9_ENCODER_VP9_SUBEXP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 void vp9_compute_update_table();
 
@@ -29,7 +33,10 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const vp9_prob *oldp,
                                               vp9_prob *bestp,
-                                              vp9_prob upd,
-                                              int b, int r);
+                                              vp9_prob upd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
-#endif  // VP9_DECODER_VP9_SUBEXP_H_
+#endif  // VP9_ENCODER_VP9_SUBEXP_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
new file mode 100644
index 00000000000..2e98fa71764
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+
+void vp9_init_layer_context(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  int layer;
+  int layer_end;
+
+  svc->spatial_layer_id = 0;
+  svc->temporal_layer_id = 0;
+
+  if (svc->number_temporal_layers > 1) {
+    layer_end = svc->number_temporal_layers;
+  } else {
+    layer_end = svc->number_spatial_layers;
+  }
+
+  for (layer = 0; layer < layer_end; ++layer) {
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+    RATE_CONTROL *const lrc = &lc->rc;
+    lc->current_video_frame_in_layer = 0;
+    lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+    lrc->ni_av_qi = oxcf->worst_allowed_q;
+    lrc->total_actual_bits = 0;
+    lrc->total_target_vs_actual = 0;
+    lrc->ni_tot_qi = 0;
+    lrc->tot_q = 0.0;
+    lrc->avg_q = 0.0;
+    lrc->ni_frames = 0;
+    lrc->decimation_count = 0;
+    lrc->decimation_factor = 0;
+    lrc->rate_correction_factor = 1.0;
+    lrc->key_frame_rate_correction_factor = 1.0;
+
+    if (svc->number_temporal_layers > 1) {
+      lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+      lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+    } else {
+      lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000;
+      lrc->last_q[0] = oxcf->best_allowed_q;
+      lrc->last_q[1] = oxcf->best_allowed_q;
+      lrc->last_q[2] = oxcf->best_allowed_q;
+    }
+
+    lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
+                                    lc->target_bandwidth, 1000);
+    lrc->bits_off_target = lrc->buffer_level;
+  }
+}
+
+// Update the layer context from a change_config() call.
+void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
+                                            const int target_bandwidth) {
+  SVC *const svc = &cpi->svc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int layer;
+  int layer_end;
+  float bitrate_alloc = 1.0;
+
+  if (svc->number_temporal_layers > 1) {
+    layer_end = svc->number_temporal_layers;
+  } else {
+    layer_end = svc->number_spatial_layers;
+  }
+
+  for (layer = 0; layer < layer_end; ++layer) {
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+    RATE_CONTROL *const lrc = &lc->rc;
+
+    if (svc->number_temporal_layers > 1) {
+      lc->target_bandwidth = oxcf->ts_target_bitrate[layer] * 1000;
+    } else {
+      lc->target_bandwidth = oxcf->ss_target_bitrate[layer] * 1000;
+    }
+    bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+    // Update buffer-related quantities.
+    lc->starting_buffer_level =
+        (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
+    lc->optimal_buffer_level =
+        (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
+    lc->maximum_buffer_size =
+        (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
+    lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
+    lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
+    // Update framerate-related quantities.
+    if (svc->number_temporal_layers > 1) {
+      lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+    } else {
+      lc->framerate = oxcf->framerate;
+    }
+    lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+    lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+    // Update qp-related quantities.
+    lrc->worst_quality = rc->worst_quality;
+    lrc->best_quality = rc->best_quality;
+  }
+}
+
+static LAYER_CONTEXT *get_layer_context(SVC *svc) {
+  return svc->number_temporal_layers > 1 ?
+         &svc->layer_context[svc->temporal_layer_id] :
+         &svc->layer_context[svc->spatial_layer_id];
+}
+
+void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_layer_context(svc);
+  RATE_CONTROL *const lrc = &lc->rc;
+  const int layer = svc->temporal_layer_id;
+
+  lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+  // Update the average layer frame size (non-cumulative per-frame-bw).
+  if (layer == 0) {
+    lc->avg_frame_size = lrc->avg_frame_bandwidth;
+  } else {
+    const double prev_layer_framerate =
+        oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
+    const int prev_layer_target_bandwidth =
+        oxcf->ts_target_bitrate[layer - 1] * 1000;
+    lc->avg_frame_size =
+        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
+              (lc->framerate - prev_layer_framerate));
+  }
+}
+
+void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  RATE_CONTROL *const lrc = &lc->rc;
+
+  lc->framerate = framerate;
+  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth *
+                                   oxcf->two_pass_vbrmin_section / 100);
+  lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
+                                   oxcf->two_pass_vbrmax_section) / 100);
+  lrc->max_gf_interval = 16;
+
+  lrc->static_scene_max_gf_interval = cpi->oxcf.key_freq >> 1;
+
+  if (oxcf->play_alternate && oxcf->lag_in_frames) {
+    if (lrc->max_gf_interval > oxcf->lag_in_frames - 1)
+      lrc->max_gf_interval = oxcf->lag_in_frames - 1;
+
+    if (lrc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+      lrc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+  }
+
+  if (lrc->max_gf_interval > lrc->static_scene_max_gf_interval)
+    lrc->max_gf_interval = lrc->static_scene_max_gf_interval;
+}
+
+void vp9_restore_layer_context(VP9_COMP *const cpi) {
+  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+  const int old_frame_since_key = cpi->rc.frames_since_key;
+  const int old_frame_to_key = cpi->rc.frames_to_key;
+
+  cpi->rc = lc->rc;
+  cpi->twopass = lc->twopass;
+  cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+  cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
+  cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
+  cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
+  // Reset the frames_since_key and frames_to_key counters to their values
+  // before the layer restore. Keep these defined for the stream (not layer).
+  if (cpi->svc.number_temporal_layers > 1) {
+    cpi->rc.frames_since_key = old_frame_since_key;
+    cpi->rc.frames_to_key = old_frame_to_key;
+  }
+}
+
+void vp9_save_layer_context(VP9_COMP *const cpi) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+
+  lc->rc = cpi->rc;
+  lc->twopass = cpi->twopass;
+  lc->target_bandwidth = (int)oxcf->target_bandwidth;
+  lc->starting_buffer_level = oxcf->starting_buffer_level;
+  lc->optimal_buffer_level = oxcf->optimal_buffer_level;
+  lc->maximum_buffer_size = oxcf->maximum_buffer_size;
+}
+
+void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
+  int i;
+
+  for (i = 0; i < svc->number_spatial_layers; ++i) {
+    struct twopass_rc *const twopass = &svc->layer_context[i].twopass;
+
+    svc->spatial_layer_id = i;
+    vp9_init_second_pass(cpi);
+
+    twopass->total_stats.spatial_layer_id = i;
+    twopass->total_left_stats.spatial_layer_id = i;
+  }
+  svc->spatial_layer_id = 0;
+}
+
+void vp9_inc_frame_in_layer(SVC *svc) {
+  LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1)
+      ? &svc->layer_context[svc->temporal_layer_id]
+      : &svc->layer_context[svc->spatial_layer_id];
+  ++lc->current_video_frame_in_layer;
+}
+
+int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
+  return cpi->use_svc &&
+         cpi->svc.number_temporal_layers == 1 &&
+         cpi->svc.spatial_layer_id > 0 &&
+         cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
new file mode 100644
index 00000000000..74d9c1c0d42
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -0,0 +1,84 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+
+#include "vpx/vpx_encoder.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // Per-layer state; see vp9_save_layer_context().
+  RATE_CONTROL rc;  // Copy of cpi->rc taken by vp9_save_layer_context().
+  int target_bandwidth;  // Saved from oxcf->target_bandwidth (cast to int).
+  int64_t starting_buffer_level;  // Saved from oxcf->starting_buffer_level.
+  int64_t optimal_buffer_level;  // Saved from oxcf->optimal_buffer_level.
+  int64_t maximum_buffer_size;  // Saved from oxcf->maximum_buffer_size.
+  double framerate;
+  int avg_frame_size;
+  struct twopass_rc twopass;  // Copy of cpi->twopass, saved alongside rc.
+  struct vpx_fixed_buf rc_twopass_stats_in;
+  unsigned int current_video_frame_in_layer;  // See vp9_inc_frame_in_layer().
+  int is_key_frame;  // Tested by vp9_is_upper_layer_key_frame().
+} LAYER_CONTEXT;
+
+typedef struct {
+  int spatial_layer_id;  // Indexes layer_context[] for spatial layers.
+  int temporal_layer_id;  // Used instead when number_temporal_layers > 1.
+  int number_spatial_layers;
+  int number_temporal_layers;
+  // Layer context used for rate control in one pass temporal CBR mode or
+  // two pass spatial mode. Defined for temporal or spatial layers for now.
+  // Does not support temporal combined with spatial RC.
+  LAYER_CONTEXT layer_context[MAX(VPX_TS_MAX_LAYERS, VPX_SS_MAX_LAYERS)];
+} SVC;
+
+struct VP9_COMP;
+
+// Initialize layer context data from init_config().
+void vp9_init_layer_context(struct VP9_COMP *const cpi);
+
+// Update the layer context from a change_config() call.
+void vp9_update_layer_context_change_config(struct VP9_COMP *const cpi,
+                                            const int target_bandwidth);
+
+// Prior to encoding the frame, update framerate-related quantities
+// for the current temporal layer.
+void vp9_update_temporal_layer_framerate(struct VP9_COMP *const cpi);
+
+// Update framerate-related quantities for the current spatial layer.
+void vp9_update_spatial_layer_framerate(struct VP9_COMP *const cpi,
+                                        double framerate);
+
+// Prior to encoding the frame, set the layer context, for the current layer
+// to be encoded, to the cpi struct.
+void vp9_restore_layer_context(struct VP9_COMP *const cpi);
+
+// Save the layer context after encoding the frame.
+void vp9_save_layer_context(struct VP9_COMP *const cpi);
+
+// Initialize second pass rc for spatial svc.
+void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
+
+// Increment number of video frames in layer
+void vp9_inc_frame_in_layer(SVC *svc);
+
+// Check if current layer is key frame in spatial upper layer
+int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 2cace0378da..6eff2008014 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -11,37 +11,49 @@
 #include <math.h>
 #include <limits.h>
 
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
-#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
 
-#define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1  // dis/enable subpel in MC AltRef filtering
+static int fixed_divide[512];
 
 static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             uint8_t *y_mb_ptr,
                                             uint8_t *u_mb_ptr,
                                             uint8_t *v_mb_ptr,
                                             int stride,
+                                            int uv_block_size,
                                             int mv_row,
                                             int mv_col,
                                             uint8_t *pred,
-                                            struct scale_factors *scale) {
+                                            struct scale_factors *scale,
+                                            int x, int y) {
   const int which_mv = 0;
-  MV mv = { mv_row, mv_col };
+  const MV mv = { mv_row, mv_col };
+  const InterpKernel *const kernel =
+    vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter);
+
+  enum mv_precision mv_precision_uv;
+  int uv_stride;
+  if (uv_block_size == 8) {
+    uv_stride = (stride + 1) >> 1;
+    mv_precision_uv = MV_PRECISION_Q4;
+  } else {
+    uv_stride = stride;
+    mv_precision_uv = MV_PRECISION_Q3;
+  }
 
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
@@ -49,25 +61,31 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             scale,
                             16, 16,
                             which_mv,
-                            &xd->subpix, MV_PRECISION_Q3);
-
-  stride = (stride + 1) >> 1;
+                            kernel, MV_PRECISION_Q3, x, y);
 
-  vp9_build_inter_predictor(u_mb_ptr, stride,
-                            &pred[256], 8,
+  vp9_build_inter_predictor(u_mb_ptr, uv_stride,
+                            &pred[256], uv_block_size,
                             &mv,
                             scale,
-                            8, 8,
+                            uv_block_size, uv_block_size,
                             which_mv,
-                            &xd->subpix, MV_PRECISION_Q4);
+                            kernel, mv_precision_uv, x, y);
 
-  vp9_build_inter_predictor(v_mb_ptr, stride,
-                            &pred[320], 8,
+  vp9_build_inter_predictor(v_mb_ptr, uv_stride,
+                            &pred[512], uv_block_size,
                             &mv,
                             scale,
-                            8, 8,
+                            uv_block_size, uv_block_size,
                             which_mv,
-                            &xd->subpix, MV_PRECISION_Q4);
+                            kernel, mv_precision_uv, x, y);
+}
+
+void vp9_temporal_filter_init(void) {
+  int i;
+  // fixed_divide[n] = 2^19 / n (0 for n == 0); users multiply, then >> 19.
+  fixed_divide[0] = 0;
+  for (i = 1; i < 512; ++i)
+    fixed_divide[i] = 0x80000 / i;
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
@@ -81,6 +99,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
   unsigned int i, j, k;
   int modifier;
   int byte = 0;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
 
   for (i = 0, k = 0; i < block_size; i++) {
     for (j = 0; j < block_size; j++, k++) {
@@ -93,7 +112,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
       // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
       modifier  *= modifier;
       modifier  *= 3;
-      modifier  += 1 << (strength - 1);
+      modifier  += rounding;
       modifier >>= strength;
 
       if (modifier > 16)
@@ -112,30 +131,28 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
   }
 }
 
-#if ALT_REF_MC_ENABLED
-
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
-                                              int stride,
-                                              int error_thresh) {
+                                              int stride) {
   MACROBLOCK *x = &cpi->mb;
   MACROBLOCKD* const xd = &x->e_mbd;
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
+  int distortion;
+  unsigned int sse;
 
-  int_mv best_ref_mv1;
-  int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-  int_mv *ref_mv;
+  MV best_ref_mv1 = {0, 0};
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
 
   // Save input state
   struct buf_2d src = x->plane[0].src;
   struct buf_2d pre = xd->plane[0].pre[0];
 
-  best_ref_mv1.as_int = 0;
-  best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
-  best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
 
   // Setup frame pointers
   x->plane[0].src.buf = arf_frame_buf;
@@ -143,38 +160,22 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   xd->plane[0].pre[0].buf = frame_ptr_buf;
   xd->plane[0].pre[0].stride = stride;
 
-  // Further step/diamond searches as necessary
-  if (cpi->speed < 8)
-    step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0);
-  else
-    step_param = cpi->sf.reduce_first_step_size + 2;
-  step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
+  step_param = cpi->sf.reduce_first_step_size + (cpi->oxcf.speed > 5 ? 1 : 0);
+  step_param = MIN(step_param, cpi->sf.max_step_search_steps - 2);
 
-  /*cpi->sf.search_method == HEX*/
   // Ignore mv costing by sending NULL pointer instead of cost arrays
-  ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0];
-  bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv,
-                           step_param, sadpb, 1,
-                           &cpi->fn_ptr[BLOCK_16X16],
-                           0, &best_ref_mv1.as_mv, &ref_mv->as_mv);
-
-#if ALT_REF_SUBPEL_ENABLED
-  // Try sub-pixel MC?
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
-  {
-    int distortion;
-    unsigned int sse;
-    // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv,
-                                           &best_ref_mv1.as_mv,
-                                           cpi->common.allow_high_precision_mv,
-                                           x->errorperbit,
-                                           &cpi->fn_ptr[BLOCK_16X16],
-                                           0, cpi->sf.subpel_iters_per_step,
-                                           NULL, NULL,
-                                           &distortion, &sse);
-  }
-#endif
+  vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+                 &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
+
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(x, ref_mv,
+                                         &best_ref_mv1,
+                                         cpi->common.allow_high_precision_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[BLOCK_16X16],
+                                         0, cpi->sf.subpel_iters_per_step,
+                                         NULL, NULL,
+                                         &distortion, &sse);
 
   // Restore input state
   x->plane[0].src = src;
@@ -182,7 +183,6 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
 
   return bestsme;
 }
-#endif
 
 static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                       int frame_count,
@@ -197,24 +197,27 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   int mb_rows = cpi->common.mb_rows;
   int mb_y_offset = 0;
   int mb_uv_offset = 0;
-  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
-  DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 + 8 * 8 + 8 * 8);
+  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
   MACROBLOCKD *mbd = &cpi->mb.e_mbd;
   YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
   uint8_t *dst1, *dst2;
-  DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 * 3);
+  const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
 
   // Save input state
   uint8_t* input_buffer[MAX_MB_PLANE];
   int i;
 
+  // TODO(aconverse): Add 4:2:2 support
+  assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y);
+
   for (i = 0; i < MAX_MB_PLANE; i++)
     input_buffer[i] = mbd->plane[i].pre[0].buf;
 
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
-#if ALT_REF_MC_ENABLED
-    // Source frames are extended to 16 pixels.  This is different than
-    //  L/A/G reference frames that have a border of 32 (VP9BORDERINPIXELS)
+    // Source frames are extended to 16 pixels. This is different than
+    //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
     // A 6/8 tap filter is used for motion search.  This requires 2 pixels
     //  before and 3 pixels after.  So the largest Y mv on a border would
     //  then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
@@ -227,62 +230,56 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
     cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
     cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
                          + (17 - 2 * VP9_INTERP_EXTEND);
-#endif
 
     for (mb_col = 0; mb_col < mb_cols; mb_col++) {
       int i, j, k;
       int stride;
 
-      vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
-      vpx_memset(count, 0, 384 * sizeof(uint16_t));
+      vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+      vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
 
-#if ALT_REF_MC_ENABLED
       cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
       cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
                            + (17 - 2 * VP9_INTERP_EXTEND);
-#endif
 
       for (frame = 0; frame < frame_count; frame++) {
+        const int thresh_low  = 10000;
+        const int thresh_high = 20000;
+
         if (cpi->frames[frame] == NULL)
           continue;
 
-        mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row = 0;
-        mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col = 0;
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+        mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
         } else {
-          int err = 0;
-#if ALT_REF_MC_ENABLED
-#define THRESH_LOW   10000
-#define THRESH_HIGH  20000
-
           // Find best match in this frame by MC
-          err = temporal_filter_find_matching_mb_c
-                (cpi,
-                 cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
-                 cpi->frames[frame]->y_buffer + mb_y_offset,
-                 cpi->frames[frame]->y_stride,
-                 THRESH_LOW);
-#endif
+          int err = temporal_filter_find_matching_mb_c(cpi,
+              cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
+              cpi->frames[frame]->y_buffer + mb_y_offset,
+              cpi->frames[frame]->y_stride);
+
           // Assign higher weight to matching MB if it's error
           // score is lower. If not applying MC default behavior
           // is to weight all MBs equal.
-          filter_weight = err < THRESH_LOW
-                          ? 2 : err < THRESH_HIGH ? 1 : 0;
+          filter_weight = err < thresh_low
+                          ? 2 : err < thresh_high ? 1 : 0;
         }
 
         if (filter_weight != 0) {
           // Construct the predictors
-          temporal_filter_predictors_mb_c
-          (mbd,
-           cpi->frames[frame]->y_buffer + mb_y_offset,
-           cpi->frames[frame]->u_buffer + mb_uv_offset,
-           cpi->frames[frame]->v_buffer + mb_uv_offset,
-           cpi->frames[frame]->y_stride,
-           mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
-           mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
-           predictor, scale);
+          temporal_filter_predictors_mb_c(mbd,
+              cpi->frames[frame]->y_buffer + mb_y_offset,
+              cpi->frames[frame]->u_buffer + mb_uv_offset,
+              cpi->frames[frame]->v_buffer + mb_uv_offset,
+              cpi->frames[frame]->y_stride,
+              mb_uv_height,
+              mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+              mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
+              predictor, scale,
+              mb_col * 16, mb_row * 16);
 
           // Apply the filter (YUV)
           vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
@@ -290,12 +287,14 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
                                     accumulator, count);
 
           vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256, 8, strength, filter_weight,
-                                    accumulator + 256, count + 256);
+                                    predictor + 256, mb_uv_height, strength,
+                                    filter_weight, accumulator + 256,
+                                    count + 256);
 
           vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 320, 8, strength, filter_weight,
-                                    accumulator + 320, count + 320);
+                                    predictor + 512, mb_uv_height, strength,
+                                    filter_weight, accumulator + 512,
+                                    count + 512);
         }
       }
 
@@ -306,7 +305,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
       for (i = 0, k = 0; i < 16; i++) {
         for (j = 0; j < 16; j++, k++) {
           unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
+          pval *= fixed_divide[count[k]];
           pval >>= 19;
 
           dst1[byte] = (uint8_t)pval;
@@ -314,7 +313,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           // move to next pixel
           byte++;
         }
-
         byte += stride - 16;
       }
 
@@ -322,35 +320,32 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
       dst2 = cpi->alt_ref_buffer.v_buffer;
       stride = cpi->alt_ref_buffer.uv_stride;
       byte = mb_uv_offset;
-      for (i = 0, k = 256; i < 8; i++) {
-        for (j = 0; j < 8; j++, k++) {
-          int m = k + 64;
+      for (i = 0, k = 256; i < mb_uv_height; i++) {
+        for (j = 0; j < mb_uv_height; j++, k++) {
+          int m = k + 256;
 
           // U
           unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
+          pval *= fixed_divide[count[k]];
           pval >>= 19;
           dst1[byte] = (uint8_t)pval;
 
           // V
           pval = accumulator[m] + (count[m] >> 1);
-          pval *= cpi->fixed_divide[count[m]];
+          pval *= fixed_divide[count[m]];
           pval >>= 19;
           dst2[byte] = (uint8_t)pval;
 
           // move to next pixel
           byte++;
         }
-
-        byte += stride - 8;
+        byte += stride - mb_uv_height;
       }
-
       mb_y_offset += 16;
-      mb_uv_offset += 8;
+      mb_uv_offset += mb_uv_height;
     }
-
     mb_y_offset += 16 * (f->y_stride - mb_cols);
-    mb_uv_offset += 8 * (f->uv_stride - mb_cols);
+    mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols);
   }
 
   // Restore input state
@@ -360,24 +355,18 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   VP9_COMMON *const cm = &cpi->common;
-
   int frame = 0;
-
   int frames_to_blur_backward = 0;
   int frames_to_blur_forward = 0;
   int frames_to_blur = 0;
   int start_frame = 0;
-
   int strength = cpi->active_arnr_strength;
   int blur_type = cpi->oxcf.arnr_type;
   int max_frames = cpi->active_arnr_frames;
-
   const int num_frames_backward = distance;
   const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
                                - (num_frames_backward + 1);
-
-  struct scale_factors scale;
-  struct scale_factors_common scale_comm;
+  struct scale_factors sf;
 
   switch (blur_type) {
     case 1:
@@ -392,7 +381,6 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 
     case 2:
       // Forward Blur
-
       frames_to_blur_forward = num_frames_forward;
 
       if (frames_to_blur_forward >= max_frames)
@@ -437,7 +425,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 #endif
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
-  vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
+  vp9_setup_scale_factors_for_frame(&sf,
       get_frame_new_buffer(cm)->y_crop_width,
       get_frame_new_buffer(cm)->y_crop_height,
       cm->width, cm->height);
@@ -452,25 +440,27 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   }
 
   temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
-                            strength, &scale);
+                            strength, &sf);
 }
 
-void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
-                           const int group_boost) {
+void vp9_configure_arnr_filter(VP9_COMP *cpi,
+                               const unsigned int frames_to_arnr,
+                               const int group_boost) {
   int half_gf_int;
   int frames_after_arf;
   int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
   int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
   int q;
 
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
-  half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
+  // Define the arnr filter width for this group of frames. We only
+  // filter frames that lie within a distance of half the GF interval
+  // from the ARF frame. We also have to trap cases where the filter
+  // extends beyond the end of the lookahead buffer.
+  // Note: frames_to_arnr parameter is the offset of the arnr
+  // frame from the current frame.
+  half_gf_int = cpi->rc.baseline_gf_interval >> 1;
+  frames_after_arf = vp9_lookahead_depth(cpi->lookahead)
+      - frames_to_arnr - 1;
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
@@ -507,11 +497,16 @@ void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
   cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
 
   // Adjust the strength based on active max q
-  q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);
-  if (q > 8) {
+  if (cpi->common.current_video_frame > 1)
+    q = ((int)vp9_convert_qindex_to_q(
+        cpi->rc.avg_frame_qindex[INTER_FRAME]));
+  else
+    q = ((int)vp9_convert_qindex_to_q(
+        cpi->rc.avg_frame_qindex[KEY_FRAME]));
+  if (q > 16) {
     cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
   } else {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q);
+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - ((16 - q) / 2);
     if (cpi->active_arnr_strength < 0)
       cpi->active_arnr_strength = 0;
   }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h
index c5f3b467e54..9453dc16ae9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.h
@@ -11,8 +11,18 @@
 #ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_temporal_filter_init(void);  // Fills the fixed-point divide table.
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
-void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
-                           const int group_boost);
+void vp9_configure_arnr_filter(VP9_COMP *cpi,
+                               const unsigned int frames_to_arnr,
+                               const int group_boost);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
index 550263aa8fc..17214c3eeb7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
@@ -8,33 +8,107 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
-#include <assert.h>
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_tokenize.h"
+
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_entropy.h"
-
-/* Global event counters used for accumulating statistics across several
-   compressions, then generating vp9_context.c = initial stats. */
 
-#ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
-#endif  /* ENTROPY_STATS */
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_tokenize.h"
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int dct_value_cost[DCT_MAX_VALUE * 2];
-const int *vp9_dct_value_cost_ptr;
+static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
+const int16_t *vp9_dct_value_cost_ptr;
+
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  -EOB_TOKEN, 2,                       // 0  = EOB
+  -ZERO_TOKEN, 4,                      // 1  = ZERO
+  -ONE_TOKEN, 6,                       // 2  = ONE
+  8, 12,                               // 3  = LOW_VAL
+  -TWO_TOKEN, 10,                      // 4  = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 5  = THREE
+  14, 16,                              // 6  = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 7  = CAT_ONE
+  18, 20,                              // 8  = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 9  = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
+};
 
-static void fill_value_tokens() {
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+  2, 6,                                // 0 = LOW_VAL
+  -TWO_TOKEN, 4,                       // 1 = TWO
+  -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
+  8, 10,                               // 3 = HIGH_LOW
+  -CATEGORY1_TOKEN, -CATEGORY2_TOKEN,  // 4 = CAT_ONE
+  12, 14,                              // 5 = CAT_THREEFOUR
+  -CATEGORY3_TOKEN, -CATEGORY4_TOKEN,  // 6 = CAT_THREE
+  -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
+};
+
+static const vp9_prob Pcat1[] = { 159};
+static const vp9_prob Pcat2[] = { 165, 145};
+static const vp9_prob Pcat3[] = { 173, 148, 140};
+static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp9_prob Pcat6[] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+  // Entries 2k and 2k+1 both hold 2(k+1), the index of the following
+  // pair; the final pair is zeroed.
+  int k;
+  for (k = 0; k + 1 < n; ++k) {
+    const int next = (k + 1) << 1;
+    p[2 * k] = p[2 * k + 1] = next;
+  }
+  p[2 * k] = p[2 * k + 1] = 0;
+}
+
+static void init_bit_trees(void) {  // Fills the cat1..cat6 tables.
+  init_bit_tree(cat1, 1);  // n is half the length of the catN array.
+  init_bit_tree(cat2, 2);
+  init_bit_tree(cat3, 3);
+  init_bit_tree(cat4, 4);
+  init_bit_tree(cat5, 5);
+  init_bit_tree(cat6, 14);
+}
+
+const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
+  {0, 0, 0, 0},           // ZERO_TOKEN
+  {0, 0, 0, 1},           // ONE_TOKEN
+  {0, 0, 0, 2},           // TWO_TOKEN
+  {0, 0, 0, 3},           // THREE_TOKEN
+  {0, 0, 0, 4},           // FOUR_TOKEN
+  {cat1, Pcat1, 1, 5},    // CATEGORY1_TOKEN
+  {cat2, Pcat2, 2, 7},    // CATEGORY2_TOKEN
+  {cat3, Pcat3, 3, 11},   // CATEGORY3_TOKEN
+  {cat4, Pcat4, 4, 19},   // CATEGORY4_TOKEN
+  {cat5, Pcat5, 5, 35},   // CATEGORY5_TOKEN
+  {cat6, Pcat6, 14, 67},  // CATEGORY6_TOKEN
+  {0, 0, 0, 0}            // EOB_TOKEN
+};
+
+struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
+
+void vp9_coef_tree_initialize(void) {
+  init_bit_trees();  // Fill the cat1..cat6 trees first.
+  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
+void vp9_tokenize_initialize() {
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
   const vp9_extra_bit *const e = vp9_extra_bits;
 
@@ -65,7 +139,7 @@ static void fill_value_tokens() {
     // initialize the cost for extra bits for all possible coefficient value.
     {
       int cost = 0;
-      const vp9_extra_bit *p = vp9_extra_bits + t[i].token;
+      const vp9_extra_bit *p = &vp9_extra_bits[t[i].token];
 
       if (p->base_val) {
         const int extra = t[i].extra;
@@ -81,24 +155,55 @@ static void fill_value_tokens() {
   } while (++i < DCT_MAX_VALUE);
 
   vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
+  vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
 }
 
 struct tokenize_b_args {
   VP9_COMP *cpi;
   MACROBLOCKD *xd;
   TOKENEXTRA **tp;
-  TX_SIZE tx_size;
 };
 
 static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   MACROBLOCKD *const xd = args->xd;
+  struct macroblock_plane *p = &args->cpi->mb.plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
-  set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff);
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0,
+                   aoff, loff);
+}
+
+static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
+                             int16_t extra, uint8_t token,
+                             uint8_t skip_eob_node, unsigned int *counts) {
+  // Write one entry at *t, step *t to the next slot, and count the token.
+  TOKENEXTRA *const out = (*t)++;
+  out->token = token;
+  out->extra = extra;
+  out->context_tree = context_tree;
+  out->skip_eob_node = skip_eob_node;
+  counts[token] += 1;
+}
+
+static INLINE void add_token_no_extra(TOKENEXTRA **t,
+                                      const vp9_prob *context_tree,
+                                      uint8_t token, uint8_t skip_eob_node,
+                                      unsigned int *counts) {
+  // Same as add_token() except that the entry's extra field is not written.
+  TOKENEXTRA *const out = (*t)++;
+  out->token = token;
+  out->context_tree = context_tree;
+  out->skip_eob_node = skip_eob_node;
+  counts[token] += 1;
+}
+
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;
+  return 16 << (tx_size << 1);  // 16 * 4^tx_size
+}
 
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -107,70 +212,80 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   VP9_COMP *cpi = args->cpi;
   MACROBLOCKD *xd = args->xd;
   TOKENEXTRA **tp = args->tp;
+  uint8_t token_cache[32 * 32];
+  struct macroblock_plane *p = &cpi->mb.plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int pt; /* near block/prev token context index */
-  int c = 0, rc = 0;
+  int c;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  const int eob = pd->eobs[block];
+  int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-
+  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
-  vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
-  vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
+  const scan_order *so;
   const int ref = is_inter_block(mbmi);
-  uint8_t token_cache[1024];
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      cpi->coef_counts[tx_size][type][ref];
+  vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->common.fc.coef_probs[tx_size][type][ref];
+  unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
+      cpi->common.counts.eob_branch[tx_size][type][ref];
+  const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
 
-  assert((!type && !plane) || (type && plane));
-
   pt = get_entropy_context(tx_size, pd->above_context + aoff,
-                                    pd->left_context + loff);
-  get_scan(xd, tx_size, type, block, &scan, &nb);
+                           pd->left_context + loff);
+  so = get_scan(xd, tx_size, type, block);
+  scan = so->scan;
+  nb = so->neighbors;
   c = 0;
-  do {
-    const int band = get_coef_band(band_translate, c);
-    int token;
+  while (c < eob) {
     int v = 0;
-    rc = scan[c];
-    if (c)
-      pt = get_coef_context(nb, token_cache, c);
-    if (c < eob) {
-      v = qcoeff_ptr[rc];
-      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
-
-      t->extra = vp9_dct_value_tokens_ptr[v].extra;
-      token    = vp9_dct_value_tokens_ptr[v].token;
-    } else {
-      token = DCT_EOB_TOKEN;
-    }
+    int skip_eob = 0;
+    v = qcoeff[scan[c]];
 
-    t->token = token;
-    t->context_tree = coef_probs[type][ref][band][pt];
-    t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+    while (!v) {
+      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
+                         counts[band[c]][pt]);
+      eob_branch[band[c]][pt] += !skip_eob;
 
-    assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
-
-    ++counts[type][ref][band][pt][token];
-    if (!t->skip_eob_node)
-      ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt];
+      skip_eob = 1;
+      token_cache[scan[c]] = 0;
+      ++c;
+      pt = get_coef_context(nb, token_cache, c);
+      v = qcoeff[scan[c]];
+    }
 
-    token_cache[rc] = vp9_pt_energy_class[token];
-    ++t;
-  } while (c < eob && ++c < seg_eob);
+    add_token(&t, coef_probs[band[c]][pt],
+              vp9_dct_value_tokens_ptr[v].extra,
+              (uint8_t)vp9_dct_value_tokens_ptr[v].token,
+              (uint8_t)skip_eob,
+              counts[band[c]][pt]);
+    eob_branch[band[c]][pt] += !skip_eob;
+
+    token_cache[scan[c]] =
+        vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+  if (c < seg_eob) {
+    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
+                       counts[band[c]][pt]);
+    ++eob_branch[band[c]][pt];
+  }
 
   *tp = t;
 
-  set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
 }
 
 struct is_skippable_args {
-  MACROBLOCKD *xd;
+  MACROBLOCK *x;
   int *skippable;
 };
 
@@ -178,21 +293,16 @@ static void is_skippable(int plane, int block,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          void *argv) {
   struct is_skippable_args *args = argv;
-  args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
+  (void)plane_bsize;
+  (void)tx_size;
+  args->skippable[0] &= (!args->x->plane[plane].eobs[block]);
 }
 
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   int result = 1;
-  struct is_skippable_args args = {xd, &result};
-  foreach_transformed_block(xd, bsize, is_skippable, &args);
-  return result;
-}
-
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                              int plane) {
-  int result = 1;
-  struct is_skippable_args args = {xd, &result};
-  foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &args);
+  struct is_skippable_args args = {x, &result};
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
+                                         &args);
   return result;
 }
 
@@ -200,17 +310,15 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   TOKENEXTRA *t_backup = *t;
-  const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
+  const int ctx = vp9_get_skip_context(xd);
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
-  struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
-
-  mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
-  if (mbmi->skip_coeff) {
+  struct tokenize_b_args arg = {cpi, xd, t};
+  if (mbmi->skip) {
     if (!dry_run)
-      cm->counts.mbskip[mb_skip_context][1] += skip_inc;
+      cm->counts.skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
     if (dry_run)
       *t = t_backup;
@@ -218,157 +326,10 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
   }
 
   if (!dry_run) {
-    cm->counts.mbskip[mb_skip_context][0] += skip_inc;
-    foreach_transformed_block(xd, bsize, tokenize_b, &arg);
+    cm->counts.skip[ctx][0] += skip_inc;
+    vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
   } else {
-    foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+    vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
     *t = t_backup;
   }
 }
-
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
-  FILE *f = fopen("context.bin", "rb");
-  if (!f) {
-    vp9_zero(context_counters);
-  } else {
-    fread(context_counters, sizeof(context_counters), 1, f);
-    fclose(f);
-  }
-
-  f = fopen("treeupdate.bin", "rb");
-  if (!f) {
-    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
-  } else {
-    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
-    fclose(f);
-  }
-}
-
-static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
-                          int block_types, const char *header) {
-  int type, ref, band, pt, t;
-
-  fprintf(f, "static const vp9_coeff_count %s = {\n", header);
-
-#define Comma(X) (X ? "," : "")
-  type = 0;
-  do {
-    ref = 0;
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    do {
-      fprintf(f, "%s\n    { /* %s */", Comma(type), ref ? "Inter" : "Intra");
-      band = 0;
-      do {
-        fprintf(f, "%s\n      { /* Coeff Band %d */", Comma(band), band);
-        pt = 0;
-        do {
-          fprintf(f, "%s\n        {", Comma(pt));
-
-          t = 0;
-          do {
-            const int64_t x = context_counters[type][ref][band][pt][t];
-            const int y = (int) x;
-
-            assert(x == (int64_t) y);  /* no overflow handling yet */
-            fprintf(f, "%s %d", Comma(t), y);
-          } while (++t < 1 + MAX_ENTROPY_TOKENS);
-          fprintf(f, "}");
-        } while (++pt < PREV_COEF_CONTEXTS);
-        fprintf(f, "\n      }");
-      } while (++band < COEF_BANDS);
-      fprintf(f, "\n    }");
-    } while (++ref < REF_TYPES);
-    fprintf(f, "\n  }");
-  } while (++type < block_types);
-  fprintf(f, "\n};\n");
-}
-
-static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
-                        int block_types, const char *header) {
-  int type, ref, band, pt, t;
-
-  fprintf(f, "static const vp9_coeff_probs %s = {", header);
-
-  type = 0;
-#define Newline(x, spaces) (x ? " " : "\n" spaces)
-  do {
-    fprintf(f, "%s%s{ /* block Type %d */",
-            Comma(type), Newline(type, "  "), type);
-    ref = 0;
-    do {
-      fprintf(f, "%s%s{ /* %s */",
-              Comma(band), Newline(band, "    "), ref ? "Inter" : "Intra");
-      band = 0;
-      do {
-        fprintf(f, "%s%s{ /* Coeff Band %d */",
-                Comma(band), Newline(band, "      "), band);
-        pt = 0;
-        do {
-          unsigned int branch_ct[ENTROPY_NODES][2];
-          unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1];
-          vp9_prob coef_probs[ENTROPY_NODES];
-
-          if (pt >= 3 && band == 0)
-            break;
-          for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t)
-            coef_counts[t] = context_counters[type][ref][band][pt][t];
-          vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs,
-                                           branch_ct, coef_counts, 0);
-          branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0];
-          coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
-          fprintf(f, "%s\n      {", Comma(pt));
-
-          t = 0;
-          do {
-            fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
-          } while (++t < ENTROPY_NODES);
-
-          fprintf(f, " }");
-        } while (++pt < PREV_COEF_CONTEXTS);
-        fprintf(f, "\n      }");
-      } while (++band < COEF_BANDS);
-      fprintf(f, "\n    }");
-    } while (++ref < REF_TYPES);
-    fprintf(f, "\n  }");
-  } while (++type < block_types);
-  fprintf(f, "\n};\n");
-}
-
-void print_context_counters() {
-  FILE *f = fopen("vp9_context.c", "w");
-
-  fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-
-  /* print counts */
-  print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
-                "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
-  print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
-                "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
-  print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
-                "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
-  print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
-                "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
-
-  /* print coefficient probabilities */
-  print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
-              "default_coef_probs_4x4[BLOCK_TYPES]");
-  print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
-              "default_coef_probs_8x8[BLOCK_TYPES]");
-  print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
-              "default_coef_probs_16x16[BLOCK_TYPES]");
-  print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
-              "default_coef_probs_32x32[BLOCK_TYPES]");
-
-  fclose(f);
-
-  f = fopen("context.bin", "wb");
-  fwrite(context_counters, sizeof(context_counters), 1, f);
-  fclose(f);
-}
-#endif
-
-void vp9_tokenize_initialize() {
-  fill_value_tokens();
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
index b78e100ec95..063c0bafe7b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
@@ -12,10 +12,18 @@
 #define VP9_ENCODER_VP9_TOKENIZE_H_
 
 #include "vp9/common/vp9_entropy.h"
+
 #include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_treewriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 void vp9_tokenize_initialize();
 
+#define EOSB_TOKEN 127     // Not signalled, encoder only
+
 typedef struct {
   int16_t token;
   int16_t extra;
@@ -28,29 +36,26 @@ typedef struct {
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                               [MAX_ENTROPY_TOKENS + 1];
+extern const vp9_tree_index vp9_coef_tree[];
+extern const vp9_tree_index vp9_coef_con_tree[];
+extern struct vp9_token vp9_coef_encodings[];
+
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
-                              int plane);
 struct VP9_COMP;
 
 void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize);
 
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
-extern const int *vp9_dct_value_cost_ptr;
+extern const int16_t *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
  *  fields are not.
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_ENCODER_VP9_TOKENIZE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.c
index e4aed5374cf..bb04b4025c8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.c
@@ -10,29 +10,49 @@
 
 #include "vp9/encoder/vp9_treewriter.h"
 
-static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
-                 int i, int c) {
-  const vp9_prob prob = probs[i / 2];
-  int b;
-
-  for (b = 0; b <= 1; ++b) {
-    const int cc = c + vp9_cost_bit(prob, b);
-    const vp9_tree_index ii = tree[i + b];
-
-    if (ii <= 0)
-      costs[-ii] = cc;
-    else
-      cost(costs, tree, probs, ii, cc);
-  }
+static void tree2tok(struct vp9_token *tokens, const vp9_tree_index *tree,  // Recursively records a (value, len) code for each leaf below index i.
+                     int i, int v, int l) {
+  v += v;  // Shift the code prefix left by one bit for this tree level.
+  ++l;  // The code is one bit longer at this level.
+
+  do {
+    const vp9_tree_index j = tree[i++];
+    if (j <= 0) {  // Non-positive entry: leaf for token -j.
+      tokens[-j].value = v;
+      tokens[-j].len = l;
+    } else {  // Positive entry: index of a child node pair.
+      tree2tok(tokens, tree, j, v, l);
+    }
+  } while (++v & 1);  // Runs exactly twice: v even (bit 0), then v odd (bit 1).
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
-  cost(costs, tree, probs, 0, 0);
+void vp9_tokens_from_tree(struct vp9_token *tokens,  // Fills tokens[] with the bit code of every leaf of tree.
+                          const vp9_tree_index *tree) {
+  tree2tok(tokens, tree, 0, 0, 0);  // Start at tree index 0 with an empty, zero-length code.
 }
 
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
-  assert(tree[0] <= 0 && tree[1] > 0);
+static unsigned int convert_distribution(unsigned int i, vp9_tree tree,  // Returns the total event count below node i; fills branch_ct on the way.
+                                         unsigned int branch_ct[][2],
+                                         const unsigned int num_events[]) {
+  unsigned int left, right;
+
+  if (tree[i] <= 0)  // Non-positive entry: leaf, count comes from num_events.
+    left = num_events[-tree[i]];
+  else
+    left = convert_distribution(tree[i], tree, branch_ct, num_events);
+
+  if (tree[i + 1] <= 0)  // Same rule for the second branch of the pair.
+    right = num_events[-tree[i + 1]];
+  else
+    right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
+
+  branch_ct[i >> 1][0] = left;  // Tree entries come in pairs, so i >> 1 is the node number.
+  branch_ct[i >> 1][1] = right;
+  return left + right;
+}
 
-  costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
-  cost(costs, tree, probs, 2, 0);
+void vp9_tree_probs_from_distribution(vp9_tree tree,  // NOTE: despite the name, this only fills branch_ct; no probabilities are computed here.
+                                      unsigned int branch_ct[/* n-1 */][2],
+                                      const unsigned int num_events[/* n */]) {
+  convert_distribution(0, tree, branch_ct, num_events);  // Walk from tree index 0; the returned total is discarded.
 }
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.h
index eeda5cda796..4a76d87cdfe 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_treewriter.h
@@ -8,47 +8,29 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_TREEWRITER_H_
 #define VP9_ENCODER_VP9_TREEWRITER_H_
 
-/* Trees map alphabets into huffman-like codes suitable for an arithmetic
-   bit coder.  Timothy S Murphy  11 October 2004 */
-
-#include "vp9/common/vp9_treecoder.h"
-
-#include "vp9/encoder/vp9_boolhuff.h"       /* for now */
-
-
-#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
-
-/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+#include "vp9/encoder/vp9_writer.h"
 
-#define vp9_cost_zero(x) (vp9_prob_cost[x])
-#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
-
-#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
-
-/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
-
-
-/* Both of these return bits, not scaled bits. */
-static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vp9_prob p) {
-  return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
-}
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-static INLINE unsigned int cost_branch(const unsigned int ct[2],
-                                       vp9_prob p) {
-  return cost_branch256(ct, p) >> 8;
-}
+void vp9_tree_probs_from_distribution(vp9_tree tree,
+                                      unsigned int branch_ct[ /* n - 1 */ ][2],
+                                      const unsigned int num_events[ /* n */ ]);
 
+struct vp9_token {  // Bit code for one tree leaf, as passed to vp9_write_tree by vp9_write_token.
+  int value;  // Code bits.
+  int len;  // Number of bits of value that make up the code.
+};
 
-static INLINE void treed_write(vp9_writer *w,
-                               vp9_tree tree, const vp9_prob *probs,
-                               int bits, int len) {
-  vp9_tree_index i = 0;
+void vp9_tokens_from_tree(struct vp9_token*, const vp9_tree_index *);
 
+static INLINE void vp9_write_tree(vp9_writer *w, const vp9_tree_index *tree,
+                                  const vp9_prob *probs, int bits, int len,
+                                  vp9_tree_index i) {
   do {
     const int bit = (bits >> --len) & 1;
     vp9_write(w, bit, probs[i >> 1]);
@@ -56,32 +38,14 @@ static INLINE void treed_write(vp9_writer *w,
   } while (len);
 }
 
-static INLINE void write_token(vp9_writer *w, vp9_tree tree,
-                               const vp9_prob *probs,
-                               const struct vp9_token *token) {
-  treed_write(w, tree, probs, token->value, token->len);
-}
-
-static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
-                             int bits, int len) {
-  int cost = 0;
-  vp9_tree_index i = 0;
-
-  do {
-    const int bit = (bits >> --len) & 1;
-    cost += vp9_cost_bit(probs[i >> 1], bit);
-    i = tree[i + bit];
-  } while (len);
-
-  return cost;
-}
-
-static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
-                             const struct vp9_token *token) {
-  return treed_cost(tree, probs, token->value, token->len);
+static INLINE void vp9_write_token(vp9_writer *w, const vp9_tree_index *tree,  // Writes the whole code of *token with the given tree and probabilities.
+                                   const vp9_prob *probs,
+                                   const struct vp9_token *token) {
+  vp9_write_tree(w, tree, probs, token->value, token->len, 0);  // Final 0: start at tree index 0.
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_ENCODER_VP9_TREEWRITER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
new file mode 100644
index 00000000000..91d8ea4dcba
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
@@ -0,0 +1,256 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+void variance(const uint8_t *a, int  a_stride,  // Outputs the sum and sum of squares of (a - b) over a w x h block.
+              const uint8_t *b, int  b_stride,
+              int  w, int  h, unsigned int *sse, int *sum) {  // NOTE: no variance is computed here; callers derive it from *sse and *sum.
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;  // Signed sum of differences.
+      *sse += diff * diff;  // Sum of squared differences.
+    }
+
+    a += a_stride;  // Step both inputs to their next row.
+    b += b_stride;
+  }
+}
+
+// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// first-pass of 2-D separable filter.
+//
+// Produces uint16_t output for the next pass. Two filter taps
+// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
+// defines the offset required to move from one input to the next.
+static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
+                                              uint16_t *output_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const int16_t *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                          (int)src_ptr[pixel_step] * vp9_filter[1],
+                          FILTER_BITS);
+
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;  // src_ptr already advanced output_width within the row.
+    output_ptr += output_width;  // Output rows are packed: stride equals output_width.
+  }
+}
+
+// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// second-pass of 2-D separable filter.
+//
+// Requires uint16_t input as produced by var_filter_block2d_bil_first_pass.
+// Two filter taps should sum to VP9_FILTER_WEIGHT. pixel_step selects a
+// horizontal (pixel_step=1) or vertical (pixel_step=stride) filter: it is the
+// offset required to move from one input to the next.
+static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
+                                               uint8_t *output_ptr,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const int16_t *vp9_filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                          (int)src_ptr[pixel_step] * vp9_filter[1],
+                          FILTER_BITS);
+      src_ptr++;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;  // src_ptr already advanced output_width within the row.
+    output_ptr += output_width;  // Output rows are packed: stride equals output_width.
+  }
+}
+
+unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {  // Returns the sum of squares of the first 256 values at src_ptr.
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; i++)  // Fixed count; presumably one 16x16 block -- TODO confirm against callers.
+    sum += src_ptr[i] * src_ptr[i];
+
+  return sum;
+}
+
+#define VAR(W, H) /* Defines vp9_variance<W>x<H>_c(). */ \
+unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                       const uint8_t *b, int b_stride, \
+                                       unsigned int *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); /* SSE minus sum^2 / pixel count; *sse keeps the raw SSE. */ \
+}
+
+#define SUBPIX_VAR(W, H) /* Defines vp9_sub_pixel_variance<W>x<H>_c(). */ \
+unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; /* H + 1 rows: the second pass reads one row below (pixel_step = W). */ \
+  uint8_t temp2[H * W]; \
+\
+  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, /* pixel_step 1: horizontal filter. */ \
+                                    BILINEAR_FILTERS_2TAP(xoffset)); \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, /* pixel_step W: vertical filter on packed rows. */ \
+                                     BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); /* Filtered block (stride W) against dst. */ \
+}
+
+#define SUBPIX_AVG_VAR(W, H) /* Defines vp9_sub_pixel_avg_variance<W>x<H>_c(). */ \
+unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; /* H + 1 rows: the second pass reads one row below (pixel_step = W). */ \
+  uint8_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, H * W); \
+\
+  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, /* pixel_step 1: horizontal filter. */ \
+                                    BILINEAR_FILTERS_2TAP(xoffset)); \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, /* pixel_step W: vertical filter on packed rows. */ \
+                                     BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); /* temp3 = average of second_pred and the filtered block. */ \
+\
+  return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); /* Averaged block (stride W) against dst. */ \
+}
+
+unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,  // Returns (and stores in *sse) the 16x16 sum of squared errors.
+                            const uint8_t *ref, int ref_stride,
+                            unsigned int *sse) {
+  int sum;  // Filled by variance() but not used here.
+  variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
+  return *sse;  // NOTE: raw SSE; it is not divided by the pixel count.
+}
+
+unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,  // Returns (and stores in *sse) the 16x8 sum of squared errors.
+                           const uint8_t *ref, int ref_stride,
+                           unsigned int *sse) {
+  int sum;  // Filled by variance() but not used here.
+  variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
+  return *sse;  // NOTE: raw SSE; it is not divided by the pixel count.
+}
+
+unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,  // Returns (and stores in *sse) the 8x16 sum of squared errors.
+                           const uint8_t *ref, int ref_stride,
+                           unsigned int *sse) {
+  int sum;  // Filled by variance() but not used here.
+  variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
+  return *sse;  // NOTE: raw SSE; it is not divided by the pixel count.
+}
+
+unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,  // Returns (and stores in *sse) the 8x8 sum of squared errors.
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse) {
+  int sum;  // Filled by variance() but not used here.
+  variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
+  return *sse;  // NOTE: raw SSE; it is not divided by the pixel count.
+}
+
+VAR(4, 4)  // Each group below instantiates the variance, sub-pixel and sub-pixel-avg functions for one W x H size.
+SUBPIX_VAR(4, 4)
+SUBPIX_AVG_VAR(4, 4)
+
+VAR(4, 8)
+SUBPIX_VAR(4, 8)
+SUBPIX_AVG_VAR(4, 8)
+
+VAR(8, 4)
+SUBPIX_VAR(8, 4)
+SUBPIX_AVG_VAR(8, 4)
+
+VAR(8, 8)
+SUBPIX_VAR(8, 8)
+SUBPIX_AVG_VAR(8, 8)
+
+VAR(8, 16)
+SUBPIX_VAR(8, 16)
+SUBPIX_AVG_VAR(8, 16)
+
+VAR(16, 8)
+SUBPIX_VAR(16, 8)
+SUBPIX_AVG_VAR(16, 8)
+
+VAR(16, 16)
+SUBPIX_VAR(16, 16)
+SUBPIX_AVG_VAR(16, 16)
+
+VAR(16, 32)
+SUBPIX_VAR(16, 32)
+SUBPIX_AVG_VAR(16, 32)
+
+VAR(32, 16)
+SUBPIX_VAR(32, 16)
+SUBPIX_AVG_VAR(32, 16)
+
+VAR(32, 32)
+SUBPIX_VAR(32, 32)
+SUBPIX_AVG_VAR(32, 32)
+
+VAR(32, 64)
+SUBPIX_VAR(32, 64)
+SUBPIX_AVG_VAR(32, 64)
+
+VAR(64, 32)
+SUBPIX_VAR(64, 32)
+SUBPIX_AVG_VAR(64, 32)
+
+VAR(64, 64)
+SUBPIX_VAR(64, 64)
+SUBPIX_AVG_VAR(64, 64)
+
+void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,  // Writes the per-pixel average of pred and ref into comp_pred.
+                       int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);  // Halve the sum via the rounding macro.
+    }
+    comp_pred += width;  // comp_pred and pred are walked as packed rows (stride == width).
+    pred += width;
+    ref += ref_stride;  // Only ref has its own stride.
+  }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h
index 2ded97c559e..c47fe133554 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.h
@@ -12,16 +12,15 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
-// #include "./vpx_config.h"
 
-void variance(const uint8_t *src_ptr,
-              int  source_stride,
-              const uint8_t *ref_ptr,
-              int  recon_stride,
-              int  w,
-              int  h,
-              unsigned int *sse,
-              int *sum);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void variance(const uint8_t *a, int a_stride,
+              const uint8_t *b, int b_stride,
+              int  w, int  h,
+              unsigned int *sse, int *sum);
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -42,12 +41,6 @@ typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int  ref_stride,
                                    unsigned int *sad_array);
 
-typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int  ref_stride,
-                                    unsigned int *sad_array);
-
 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
                                      const uint8_t* const ref_ptr[],
@@ -76,40 +69,22 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
 
-typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
-                                                   int source_stride,
-                                                   const uint8_t *ref_ptr,
-                                                   int  ref_stride);
-
 typedef struct vp9_variance_vtable {
   vp9_sad_fn_t               sdf;
   vp9_sad_avg_fn_t           sdaf;
   vp9_variance_fn_t          vf;
   vp9_subpixvariance_fn_t    svf;
   vp9_subp_avg_variance_fn_t svaf;
-  vp9_variance_fn_t          svf_halfpix_h;
-  vp9_variance_fn_t          svf_halfpix_v;
-  vp9_variance_fn_t          svf_halfpix_hv;
   vp9_sad_multi_fn_t         sdx3f;
-  vp9_sad_multi1_fn_t        sdx8f;
+  vp9_sad_multi_fn_t         sdx8f;
   vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
-static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
+void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+                       int height, const uint8_t *ref, int ref_stride);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp;
-      tmp = pred[j] + ref[j];
-      comp_pred[j] = (tmp + 1) >> 1;
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance_c.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance_c.c
deleted file mode 100644
index 8bc38508991..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance_c.c
+++ /dev/null
@@ -1,1094 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_filter.h"
-
-#include "vp9/encoder/vp9_variance.h"
-
-void variance(const uint8_t *src_ptr,
-              int  source_stride,
-              const uint8_t *ref_ptr,
-              int  recon_stride,
-              int  w,
-              int  h,
-              unsigned int *sse,
-              int *sum) {
-  int i, j;
-  int diff;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      diff = src_ptr[j] - ref_ptr[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
- *                  uint32_t src_pixels_per_line : Stride of input block.
- *                  uint32_t pixel_step        : Offset between filter input
- *                                               samples (see notes).
- *                  uint32_t output_height     : Input block height.
- *                  uint32_t output_width      : Input block width.
- *                  int32_t  *vp9_filter       : Array of 2 bi-linear filter
- *                                               taps.
- *
- *  OUTPUTS       : int32_t *output_ptr        : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=
- *                  stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
-                                              uint16_t *output_ptr,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const int16_t *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : int32_t  *src_ptr          : Pointer to source block.
- *                  uint32_t src_pixels_per_line : Stride of input block.
- *                  uint32_t pixel_step        : Offset between filter input
- *                                               samples (see notes).
- *                  uint32_t output_height     : Input block height.
- *                  uint32_t output_width      : Input block width.
- *                  int32_t  *vp9_filter       : Array of 2 bi-linear filter
- *                                               taps.
- *
- *  OUTPUTS       : uint16_t *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by
- *                  filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=
- *                  stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
-                                               uint8_t *output_ptr,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const int16_t *vp9_filter) {
-  unsigned int  i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-      src_ptr++;
-    }
-
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; i++) {
-    sum += (src_ptr[i] * src_ptr[i]);
-  }
-
-  return sum;
-}
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
-}
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
-  uint8_t temp2[68 * 64];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 33, 64, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
-
-  return vp9_variance64x32(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
-  uint8_t temp2[68 * 64];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 33, 64, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
-  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
-  return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
-}
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
-  uint8_t temp2[68 * 64];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 65, 32, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
-
-  return vp9_variance32x64(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
-  uint8_t temp2[68 * 64];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 65, 32, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
-  return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
-}
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
-  uint8_t temp2[36 * 32];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 17, 32, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
-
-  return vp9_variance32x16(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
-  uint8_t temp2[36 * 32];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 17, 32, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
-  return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
-}
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
-  uint8_t temp2[36 * 32];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 33, 16, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
-
-  return vp9_variance16x32(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
-  uint8_t temp2[36 * 32];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 33, 16, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
-  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
-  return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
-}
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
-}
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr,
-                                 int  source_stride,
-                                 const uint8_t *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr,
-                                int  source_stride,
-                                const uint8_t *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr,
-                                int  source_stride,
-                                const uint8_t *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
-                       const uint8_t *ref_ptr, int ref_stride,
-                       unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
-                               int  source_stride,
-                               const uint8_t *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
-                               int  source_stride,
-                               const uint8_t *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 5));
-}
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
-                               int  source_stride,
-                               const uint8_t *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 5));
-}
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
-                               int  source_stride,
-                               const uint8_t *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-
-unsigned int vp9_mse16x16_c(const uint8_t *src_ptr,
-                            int  source_stride,
-                            const uint8_t *ref_ptr,
-                            int  recon_stride,
-                            unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return var;
-}
-
-unsigned int vp9_mse16x8_c(const uint8_t *src_ptr,
-                           int  source_stride,
-                           const uint8_t *ref_ptr,
-                           int  recon_stride,
-                           unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-  *sse = var;
-  return var;
-}
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr,
-                           int  source_stride,
-                           const uint8_t *ref_ptr,
-                           int  recon_stride,
-                           unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-  *sse = var;
-  return var;
-}
-
-unsigned int vp9_mse8x8_c(const uint8_t *src_ptr,
-                          int  source_stride,
-                          const uint8_t *ref_ptr,
-                          int  recon_stride,
-                          unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-  *sse = var;
-  return var;
-}
-
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const uint8_t *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  // First filter 1d Horizontal
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 5, 4, hfilter);
-
-  // Now filter Verticaly
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
-
-  return vp9_variance4x4(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
-                                             int  src_pixels_per_line,
-                                             int  xoffset,
-                                             int  yoffset,
-                                             const uint8_t *dst_ptr,
-                                             int dst_pixels_per_line,
-                                             unsigned int *sse,
-                                             const uint8_t *second_pred) {
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
-  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  // First filter 1d Horizontal
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 5, 4, hfilter);
-
-  // Now filter Verticaly
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
-  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
-  return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const uint8_t *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 9, 8, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
-
-  return vp9_variance8x8(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
-                                             int  src_pixels_per_line,
-                                             int  xoffset,
-                                             int  yoffset,
-                                             const uint8_t *dst_ptr,
-                                             int dst_pixels_per_line,
-                                             unsigned int *sse,
-                                             const uint8_t *second_pred) {
-  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 9, 8, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
-  return vp9_variance8x8(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[17 * 16];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 17, 16, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
-
-  return vp9_variance16x16(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[17 * 16];
-  uint8_t temp2[20 * 16];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 17, 16, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
-
-  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
-  return vp9_variance16x16(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
-  uint8_t temp2[68 * 64];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 65, 64, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
-
-  return vp9_variance64x64(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
-  uint8_t temp2[68 * 64];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 65, 64, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
-  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
-  return vp9_variance64x64(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const uint8_t *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
-  uint8_t temp2[36 * 32];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 33, 32, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
-
-  return vp9_variance32x32(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
-                                               int  src_pixels_per_line,
-                                               int  xoffset,
-                                               int  yoffset,
-                                               const uint8_t *dst_ptr,
-                                               int dst_pixels_per_line,
-                                               unsigned int *sse,
-                                               const uint8_t *second_pred) {
-  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
-  uint8_t temp2[36 * 32];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 33, 32, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
-  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
-  return vp9_variance32x32(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_mse16x16_c(const uint8_t *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const uint8_t *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const uint8_t *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const uint8_t *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
-                                          int  src_pixels_per_line,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const uint8_t *dst_ptr,
-                                          int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 9, 16, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
-
-  return vp9_variance16x8(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
-                                              int  src_pixels_per_line,
-                                              int  xoffset,
-                                              int  yoffset,
-                                              const uint8_t *dst_ptr,
-                                              int dst_pixels_per_line,
-                                              unsigned int *sse,
-                                              const uint8_t *second_pred) {
-  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 9, 16, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
-  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
-  return vp9_variance16x8(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
-                                          int  src_pixels_per_line,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const uint8_t *dst_ptr,
-                                          int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 17, 8, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
-
-  return vp9_variance8x16(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
-                                              int  src_pixels_per_line,
-                                              int  xoffset,
-                                              int  yoffset,
-                                              const uint8_t *dst_ptr,
-                                              int dst_pixels_per_line,
-                                              unsigned int *sse,
-                                              const uint8_t *second_pred) {
-  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 17, 8, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
-  return vp9_variance8x16(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const uint8_t *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 5, 8, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
-
-  return vp9_variance8x4(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
-                                             int  src_pixels_per_line,
-                                             int  xoffset,
-                                             int  yoffset,
-                                             const uint8_t *dst_ptr,
-                                             int dst_pixels_per_line,
-                                             unsigned int *sse,
-                                             const uint8_t *second_pred) {
-  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 5, 8, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
-  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
-  return vp9_variance8x4(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const uint8_t *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
-  // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be
-  // of this big? same issue appears in all other block size settings.
-  uint8_t temp2[20 * 16];
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 9, 4, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
-
-  return vp9_variance4x8(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
-                                             int  src_pixels_per_line,
-                                             int  xoffset,
-                                             int  yoffset,
-                                             const uint8_t *dst_ptr,
-                                             int dst_pixels_per_line,
-                                             unsigned int *sse,
-                                             const uint8_t *second_pred) {
-  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
-  uint8_t temp2[20 * 16];
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8);  // compound pred buffer
-  const int16_t *hfilter, *vfilter;
-
-  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
-  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
-                                    1, 9, 4, hfilter);
-  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
-  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
-  return vp9_variance4x8(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.c
new file mode 100644
index 00000000000..962d0ca5645
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.c
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
+size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {  // rounds up
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int p = off / CHAR_BIT;  // index of the byte that receives the bit
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;  // shift; bytes fill MSB first
+  if (q == CHAR_BIT -1) {  // first bit of a byte: overwrite the whole byte
+    wb->bit_buffer[p] = bit << q;
+  } else {  // otherwise clear the target bit, then OR the new value in
+    wb->bit_buffer[p] &= ~(1 << q);
+    wb->bit_buffer[p] |= bit << q;
+  }
+  wb->bit_offset = off + 1;
+}
+
+void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)  // most significant bit first
+    vp9_wb_write_bit(wb, (data >> bit) & 1);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.h
index 6f91cfc85c9..073608d7f9f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_write_bit_buffer.h
@@ -8,41 +8,31 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_BIT_WRITE_BUFFER_H_
-#define VP9_BIT_WRITE_BUFFER_H_
+#ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
 
 #include <limits.h>
 
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct vp9_write_bit_buffer {
   uint8_t *bit_buffer;
   size_t bit_offset;
 };
 
-static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
-  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
-}
-
-static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
-  const int off = wb->bit_offset;
-  const int p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
-  if (q == CHAR_BIT -1) {
-    wb->bit_buffer[p] = bit << q;
-  } else {
-    wb->bit_buffer[p] &= ~(1 << q);
-    wb->bit_buffer[p] |= bit << q;
-  }
-  wb->bit_offset = off + 1;
-}
-
-static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
-                              int data, int bits) {
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    vp9_wb_write_bit(wb, (data >> bit) & 1);
-}
-
-
-#endif  // VP9_BIT_WRITE_BUFFER_H_
+size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb);
+
+void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
+
+void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits);
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.c
new file mode 100644
index 00000000000..8398fc07a4a
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.c
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "vp9/encoder/vp9_writer.h"
+#include "vp9/common/vp9_entropy.h"
+
+void vp9_start_encode(vp9_writer *br, uint8_t *source) {
+  br->lowvalue = 0;
+  br->range    = 255;
+  br->value    = 0;
+  br->count    = -24;
+  br->buffer   = source;
+  br->pos      = 0;
+  vp9_write_bit(br, 0);  // the coded data always begins with a 0 bit
+}
+
+void vp9_stop_encode(vp9_writer *br) {
+  int i;
+
+  for (i = 0; i < 32; i++)  // append 32 zero bits
+    vp9_write_bit(br, 0);
+
+  // Ensure there's no ambiguous collision with any index marker bytes
+  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)  // top 3 bits are 110
+    br->buffer[br->pos++] = 0;
+}
+
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_boolhuff.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h
index c3f340d1bdf..7f4fa1ef2b8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_boolhuff.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_writer.h
@@ -8,19 +8,17 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-/****************************************************************************
-*
-*   Module Title :     vp9_boolhuff.h
-*
-*   Description  :     Bool Coder header file.
-*
-****************************************************************************/
-#ifndef VP9_ENCODER_VP9_BOOLHUFF_H_
-#define VP9_ENCODER_VP9_BOOLHUFF_H_
+#ifndef VP9_ENCODER_VP9_WRITER_H_
+#define VP9_ENCODER_VP9_WRITER_H_
 
 #include "vpx_ports/mem.h"
 
+#include "vp9/common/vp9_prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct {
   unsigned int lowvalue;
   unsigned int range;
@@ -31,16 +29,12 @@ typedef struct {
 
   // Variables used to track bit costs without outputing to the bitstream
   unsigned int  measure_cost;
-  unsigned long bit_counter;
+  uint64_t bit_counter;
 } vp9_writer;
 
-extern const unsigned int vp9_prob_cost[256];
-
 void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
 void vp9_stop_encode(vp9_writer *bc);
 
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
 static void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
@@ -48,17 +42,6 @@ static void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int lowvalue = br->lowvalue;
   register unsigned int shift;
 
-#ifdef ENTROPY_STATS
-#if defined(SECTIONBITS_OUTPUT)
-
-  if (bit)
-    Sectionbits[active_section] += vp9_prob_cost[255 - probability];
-  else
-    Sectionbits[active_section] += vp9_prob_cost[probability];
-
-#endif
-#endif
-
   split = 1 + (((range - 1) * probability) >> 8);
 
   range = split;
@@ -111,5 +94,10 @@ static void vp9_write_literal(vp9_writer *w, int data, int bits) {
     vp9_write_bit(w, 1 & (data >> bit));
 }
 
+#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
-#endif  // VP9_ENCODER_VP9_BOOLHUFF_H_
+#endif  // VP9_ENCODER_VP9_WRITER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
new file mode 100644
index 00000000000..9ea22fed2b7
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
@@ -0,0 +1,2710 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+#define pair256_set_epi16(a, b) \
+  _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)  // a = even elements
+
+#define pair256_set_epi32(a, b) \
+  _mm256_set_epi32(b, a, b, a, b, a, b, a)  // a = even elements, b = odd
+
+// For every 64-bit element: multiplies the low 32-bit halves and the high
+// 32-bit halves of a and b (unsigned, 64-bit products) and returns their sum.
+// Compiled only when FDCT32x32_HIGH_PRECISION is non-zero.
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+  __m256i buf0, buf1;
+  buf0 = _mm256_mul_epu32(a, b);  // products of the low 32-bit halves
+  a = _mm256_srli_epi64(a, 32);   // bring the high halves down
+  b = _mm256_srli_epi64(b, 32);
+  buf1 = _mm256_mul_epu32(a, b);  // products of the high 32-bit halves
+  return _mm256_add_epi64(buf0, buf1);
+}
+// Keeps the low 32 bits of each 64-bit element of a and b (no saturation).
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+  return _mm256_unpacklo_epi64(buf0, buf1);  // per 128-bit lane: a, then b
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input,
+                  int16_t *output_org, int stride) {
+  // Calculate pre-multiplied strides
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat eight times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+  const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64,   cospi_24_64);
+  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64,  cospi_8_64);
+  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64,  cospi_20_64);
+  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64,  cospi_12_64);
+  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64,   cospi_28_64);
+  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64,  cospi_4_64);
+  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64,  cospi_2_64);
+  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64,  cospi_18_64);
+  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64,  cospi_10_64);
+  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64,   cospi_26_64);
+  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64,  cospi_6_64);
+  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64,  cospi_22_64);
+  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64,  cospi_14_64);
+  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64,   cospi_30_64);
+  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64,  cospi_1_64);
+  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64,  cospi_17_64);
+  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64,  cospi_9_64);
+  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64,   cospi_25_64);
+  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64,  cospi_7_64);
+  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64,   cospi_23_64);
+  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64,  cospi_15_64);
+  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64,   cospi_31_64);
+  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64,  cospi_5_64);
+  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64,  cospi_21_64);
+  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64,  cospi_13_64);
+  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64,   cospi_29_64);
+  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64,  cospi_3_64);
+  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64,  cospi_19_64);
+  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64,  cospi_11_64);
+  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64,   cospi_27_64);
+  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  const __m256i kZero = _mm256_set1_epi16(0);
+  const __m256i kOne  = _mm256_set1_epi16(1);
+  // Do the two transform/transpose passes
+  int pass;
+  for (pass = 0; pass < 2; ++pass) {
+    // We process sixteen columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 32; column_start += 16) {
+      __m256i step1[32];
+      __m256i step2[32];
+      __m256i step3[32];
+      __m256i out[32];
+      // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      //       intrinsic makes the code slightly slower.
+      if (0 == pass) {
+        const int16_t *in  = &input[column_start];
+        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          const int16_t *ina =  in +  0 * str1;
+          const int16_t *inb =  in + 31 * str1;
+          __m256i *step1a = &step1[ 0];
+          __m256i *step1b = &step1[31];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  4 * str1;
+          const int16_t *inb =  in + 27 * str1;
+          __m256i *step1a = &step1[ 4];
+          __m256i *step1b = &step1[27];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  8 * str1;
+          const int16_t *inb =  in + 23 * str1;
+          __m256i *step1a = &step1[ 8];
+          __m256i *step1b = &step1[23];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in + 12 * str1;
+          const int16_t *inb =  in + 19 * str1;
+          __m256i *step1a = &step1[12];
+          __m256i *step1b = &step1[19];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+      } else {
+        int16_t *in = &intermediate[column_start];
+        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
+        // Note: using the same approach as above to have common offset is
+        //       counter-productive as all offsets can be calculated at compile
+        //       time.
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          __m256i in00  = _mm256_loadu_si256((const __m256i *)(in +  0 * 32));
+          __m256i in01  = _mm256_loadu_si256((const __m256i *)(in +  1 * 32));
+          __m256i in02  = _mm256_loadu_si256((const __m256i *)(in +  2 * 32));
+          __m256i in03  = _mm256_loadu_si256((const __m256i *)(in +  3 * 32));
+          __m256i in28  = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+          __m256i in29  = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+          __m256i in30  = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+          __m256i in31  = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+          step1[ 0] = _mm256_add_epi16(in00, in31);
+          step1[ 1] = _mm256_add_epi16(in01, in30);
+          step1[ 2] = _mm256_add_epi16(in02, in29);
+          step1[ 3] = _mm256_add_epi16(in03, in28);
+          step1[28] = _mm256_sub_epi16(in03, in28);
+          step1[29] = _mm256_sub_epi16(in02, in29);
+          step1[30] = _mm256_sub_epi16(in01, in30);
+          step1[31] = _mm256_sub_epi16(in00, in31);
+        }
+        {
+          __m256i in04  = _mm256_loadu_si256((const __m256i *)(in +  4 * 32));
+          __m256i in05  = _mm256_loadu_si256((const __m256i *)(in +  5 * 32));
+          __m256i in06  = _mm256_loadu_si256((const __m256i *)(in +  6 * 32));
+          __m256i in07  = _mm256_loadu_si256((const __m256i *)(in +  7 * 32));
+          __m256i in24  = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+          __m256i in25  = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+          __m256i in26  = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+          __m256i in27  = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+          step1[ 4] = _mm256_add_epi16(in04, in27);
+          step1[ 5] = _mm256_add_epi16(in05, in26);
+          step1[ 6] = _mm256_add_epi16(in06, in25);
+          step1[ 7] = _mm256_add_epi16(in07, in24);
+          step1[24] = _mm256_sub_epi16(in07, in24);
+          step1[25] = _mm256_sub_epi16(in06, in25);
+          step1[26] = _mm256_sub_epi16(in05, in26);
+          step1[27] = _mm256_sub_epi16(in04, in27);
+        }
+        {
+          __m256i in08  = _mm256_loadu_si256((const __m256i *)(in +  8 * 32));
+          __m256i in09  = _mm256_loadu_si256((const __m256i *)(in +  9 * 32));
+          __m256i in10  = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+          __m256i in11  = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+          __m256i in20  = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+          __m256i in21  = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+          __m256i in22  = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+          __m256i in23  = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+          step1[ 8] = _mm256_add_epi16(in08, in23);
+          step1[ 9] = _mm256_add_epi16(in09, in22);
+          step1[10] = _mm256_add_epi16(in10, in21);
+          step1[11] = _mm256_add_epi16(in11, in20);
+          step1[20] = _mm256_sub_epi16(in11, in20);
+          step1[21] = _mm256_sub_epi16(in10, in21);
+          step1[22] = _mm256_sub_epi16(in09, in22);
+          step1[23] = _mm256_sub_epi16(in08, in23);
+        }
+        {
+          __m256i in12  = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+          __m256i in13  = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+          __m256i in14  = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+          __m256i in15  = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+          __m256i in16  = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+          __m256i in17  = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+          __m256i in18  = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+          __m256i in19  = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+          step1[12] = _mm256_add_epi16(in12, in19);
+          step1[13] = _mm256_add_epi16(in13, in18);
+          step1[14] = _mm256_add_epi16(in14, in17);
+          step1[15] = _mm256_add_epi16(in15, in16);
+          step1[16] = _mm256_sub_epi16(in15, in16);
+          step1[17] = _mm256_sub_epi16(in14, in17);
+          step1[18] = _mm256_sub_epi16(in13, in18);
+          step1[19] = _mm256_sub_epi16(in12, in19);
+        }
+      }
+      // Stage 2
+      {
+        step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
+        step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
+        step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
+        step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
+        step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
+        step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
+        step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
+        step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
+        step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
+        step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+      }
+      {
+        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+        // Combine
+        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+      }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // damp the magnitude by 4 ((x + 1 + (x < 0)) >> 2), hence the
+      // intermediate values are within the range of 16 bits.
+      if (1 == pass) {
+        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]);
+        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]);
+        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]);
+        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]);
+        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]);
+        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]);
+        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]);
+        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]);
+        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]);
+        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]);
+        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]);
+        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]);
+        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]);
+        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]);
+        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]);
+        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]);
+        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]);
+        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]);
+        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]);
+        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]);
+        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]);
+        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]);
+        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]);
+        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]);
+        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]);
+        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]);
+        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]);
+        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]);
+        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]);
+        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]);
+        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]);
+        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]);
+
+        step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
+        step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
+        step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
+        step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
+        step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
+        step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
+        step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
+        step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
+        step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
+        step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+        step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
+        step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
+        step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
+        step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
+        step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
+        step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
+        step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
+        step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
+        step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
+        step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm256_add_epi16(step2[10], kOne);
+        step2[11] = _mm256_add_epi16(step2[11], kOne);
+        step2[12] = _mm256_add_epi16(step2[12], kOne);
+        step2[13] = _mm256_add_epi16(step2[13], kOne);
+        step2[14] = _mm256_add_epi16(step2[14], kOne);
+        step2[15] = _mm256_add_epi16(step2[15], kOne);
+        step1[16] = _mm256_add_epi16(step1[16], kOne);
+        step1[17] = _mm256_add_epi16(step1[17], kOne);
+        step1[18] = _mm256_add_epi16(step1[18], kOne);
+        step1[19] = _mm256_add_epi16(step1[19], kOne);
+        step2[20] = _mm256_add_epi16(step2[20], kOne);
+        step2[21] = _mm256_add_epi16(step2[21], kOne);
+        step2[22] = _mm256_add_epi16(step2[22], kOne);
+        step2[23] = _mm256_add_epi16(step2[23], kOne);
+        step2[24] = _mm256_add_epi16(step2[24], kOne);
+        step2[25] = _mm256_add_epi16(step2[25], kOne);
+        step2[26] = _mm256_add_epi16(step2[26], kOne);
+        step2[27] = _mm256_add_epi16(step2[27], kOne);
+        step1[28] = _mm256_add_epi16(step1[28], kOne);
+        step1[29] = _mm256_add_epi16(step1[29], kOne);
+        step1[30] = _mm256_add_epi16(step1[30], kOne);
+        step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+        step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
+        step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
+        step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
+        step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
+        step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
+        step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
+        step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
+        step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
+        step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
+        step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm256_srai_epi16(step2[10], 2);
+        step2[11] = _mm256_srai_epi16(step2[11], 2);
+        step2[12] = _mm256_srai_epi16(step2[12], 2);
+        step2[13] = _mm256_srai_epi16(step2[13], 2);
+        step2[14] = _mm256_srai_epi16(step2[14], 2);
+        step2[15] = _mm256_srai_epi16(step2[15], 2);
+        step1[16] = _mm256_srai_epi16(step1[16], 2);
+        step1[17] = _mm256_srai_epi16(step1[17], 2);
+        step1[18] = _mm256_srai_epi16(step1[18], 2);
+        step1[19] = _mm256_srai_epi16(step1[19], 2);
+        step2[20] = _mm256_srai_epi16(step2[20], 2);
+        step2[21] = _mm256_srai_epi16(step2[21], 2);
+        step2[22] = _mm256_srai_epi16(step2[22], 2);
+        step2[23] = _mm256_srai_epi16(step2[23], 2);
+        step2[24] = _mm256_srai_epi16(step2[24], 2);
+        step2[25] = _mm256_srai_epi16(step2[25], 2);
+        step2[26] = _mm256_srai_epi16(step2[26], 2);
+        step2[27] = _mm256_srai_epi16(step2[27], 2);
+        step1[28] = _mm256_srai_epi16(step1[28], 2);
+        step1[29] = _mm256_srai_epi16(step1[29], 2);
+        step1[30] = _mm256_srai_epi16(step1[30], 2);
+        step1[31] = _mm256_srai_epi16(step1[31], 2);
+      }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
+      // Stage 3
+      {
+        step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+        step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+        step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+        step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+        step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+        step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+        step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+        step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+      }
+      {
+        const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+        const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+        const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+        const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+        const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+        const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+        const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+        const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+        const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+        const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+        const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+        const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+        const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+        const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+        const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+        const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+        const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+        const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+        const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        // Combine
+        step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+        step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+        step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+        step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+      }
+      {
+        step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+        step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+        step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+        step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+        step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+        step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+        step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+        step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+        step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+        step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+        step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+        step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+        step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+        step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+        step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+        step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+      }
+
+      // Stage 4
+      {
+        step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
+        step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
+        step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
+        step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
+        step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
+        step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
+        step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
+        step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
+        step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+        step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+        step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+        step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+      }
+      {
+        const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+        const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+        const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+        const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+        const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+        const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+        const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+        const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+        const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+        // Combine
+        step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+        step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+      }
+      {
+        const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+        const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+        const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+        const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+        const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+        const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+        const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+        const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+        const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+        const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+        const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+        const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+        const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+        const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+        const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+        const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+        const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+        const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+        const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+        const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+        const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+        const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+        const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+        const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+        const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+        const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+        const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+        const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+        const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+        const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+        const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+        const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+        const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+        const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+        const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+        const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+        const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+        const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+        const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+        // Combine
+        step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+        step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+        step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+        step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+        step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+        step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+        step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+        step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+      }
+      // Stage 5
+      {
+        step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+        step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+        step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+        step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+      }
+      {
+        const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+        const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+        const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+        const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+        const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+        const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+        const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+        const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+        const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+        const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+        const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+        const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+        const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+        const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+        const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+        const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+        const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+        const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+        const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+        // Combine
+        out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
+        out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+        out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
+        out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+      }
+      {
+        const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
+        const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
+        const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+        const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+        const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+        const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+        const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+        const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+        const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+        const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+        const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+        const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+        const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+        const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+        const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+        const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+        const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+        const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+        const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+        // Combine
+        step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+        step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+        step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+        step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+      }
+      {
+        step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+        step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+        step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+        step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+        step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+        step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+        step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+        step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+        step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+        step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+        step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+        step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+        step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+        step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+        step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+        step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+      }
+      // Stage 6
+      {
+        const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+        const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+        const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+        const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+        const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+        const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+        const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+        const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+        // dct_const_round_shift
+        const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+        const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+        const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+        const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+        const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+        const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+        const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+        const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+        // Combine
+        out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
+        out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+        out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+        out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+      }
+      {
+        step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
+        step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
+        step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+        step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+        step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+        step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+        step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+        step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+      }
+      {
+        const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+        const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+        const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+        const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+        const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+        const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+        const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+        const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+        const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+        const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+        const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+        const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+        const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+        const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+        const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+        const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+        const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+        const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+        const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+        const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+        const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+        const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+        const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+        const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+        // dct_const_round_shift
+        const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+        const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+        const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+        const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+        const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+        const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+        const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+        const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+        const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+        const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+        const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+        const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+        const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+        const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+        const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+        const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+        // Combine
+        step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+        step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+        step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+        step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+        // Combine
+        step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+        step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+        step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+        step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+      }
+      // Stage 7
+      {
+        const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
+        const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
+        const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
+        const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
+        const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+        const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+        const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+        const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+        const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+        const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+        const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+        const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+        const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+        const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+        const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+        const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+        const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+        const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+        const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+        const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+        const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+        const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+        const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+        const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+        // dct_const_round_shift
+        const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+        const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+        const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+        const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+        const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+        const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+        const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+        const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+        const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+        const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+        const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+        const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+        const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+        const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+        const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+        const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+        // Combine
+        out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
+        out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+        out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+        out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+        out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
+        out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+        out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+        out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+      }
+      {
+        step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+        step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+        step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+        step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+        step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+        step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+        step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+        step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+        step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+        step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+        step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+        step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+        step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+        step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+        step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+        step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+      }
+      // Final stage --- outputs indices are bit-reversed.
+      {
+        const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+        const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+        const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+        const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+        const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+        const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+        const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+        const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+        const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+        const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+        const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+        const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+        const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+        const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+        const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+        const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+        const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+        const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+        const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+        const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+        const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+        const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+        const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+        const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+        // dct_const_round_shift
+        const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+        const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+        const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+        const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+        const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+        const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+        const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+        const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+        const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+        const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+        const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+        const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+        const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+        const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+        const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+        const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+        // Combine
+        out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
+        out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+        out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
+        out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+        out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
+        out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+        out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+        out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+      }
+      {
+        const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+        const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+        const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+        const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+        const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+        const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+        const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+        const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+        const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+        const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+        const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+        const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+        const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+        const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+        const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+        const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+        const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+        const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+        const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+        const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+        const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+        const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+        const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+        const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+        // dct_const_round_shift
+        const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+        const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+        const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+        const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+        const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+        const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+        const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+        const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+        const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+        const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+        const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+        const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+        const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+        const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+        const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+        const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+        // Combine
+        out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
+        out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+        out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+        out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+        out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
+        out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+        out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+        out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+      }
+#if FDCT32x32_HIGH_PRECISION
+      } else {
+        __m256i lstep1[64], lstep2[64], lstep3[64];
+        __m256i u[32], v[32], sign[16];
+        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+        // start using 32-bit operations
+        // stage 3
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+          lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+
+          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+        }
+
+        // stage 4
+        {
+          // expanding to 32-bit length prior to addition operations
+          lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+          lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
+          lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
+          lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
+          lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
+          lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
+          lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
+          lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
+          lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+        }
+        {
+        // to be continued...
+        //
+        const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+        const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+        u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+        // instruction latency.
+        v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+        v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+        v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+        v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+        v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+        v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+        v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+        v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+        u[0] = k_packs_epi64_avx2(v[0], v[1]);
+        u[1] = k_packs_epi64_avx2(v[2], v[3]);
+        u[2] = k_packs_epi64_avx2(v[4], v[5]);
+        u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+        v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+        lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 5
+        {
+          lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
+          lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
+          lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
+          lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+        }
+        {
+          const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+          const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32_avx2 to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+          v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+          v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+          v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+          v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+          v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+          v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+          v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          // Combine
+          out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+          out[16] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+          out[24] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+        }
+        {
+          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+        }
+        // stage 6
+        {
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+          out[20] = _mm256_packs_epi32(u[2], u[3]);
+          out[12] = _mm256_packs_epi32(u[4], u[5]);
+          out[28] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+        }
+        {
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+          const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
+                                                     -cospi_20_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 7
+        {
+          const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
+          const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
+          const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
+          const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64,  cospi_26_64);
+          const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
+          const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
+          const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
+          const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], K32One);
+          v[ 1] = _mm256_add_epi32(u[ 1], K32One);
+          v[ 2] = _mm256_add_epi32(u[ 2], K32One);
+          v[ 3] = _mm256_add_epi32(u[ 3], K32One);
+          v[ 4] = _mm256_add_epi32(u[ 4], K32One);
+          v[ 5] = _mm256_add_epi32(u[ 5], K32One);
+          v[ 6] = _mm256_add_epi32(u[ 6], K32One);
+          v[ 7] = _mm256_add_epi32(u[ 7], K32One);
+          v[ 8] = _mm256_add_epi32(u[ 8], K32One);
+          v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], 2);
+          u[ 1] = _mm256_srai_epi32(v[ 1], 2);
+          u[ 2] = _mm256_srai_epi32(v[ 2], 2);
+          u[ 3] = _mm256_srai_epi32(v[ 3], 2);
+          u[ 4] = _mm256_srai_epi32(v[ 4], 2);
+          u[ 5] = _mm256_srai_epi32(v[ 5], 2);
+          u[ 6] = _mm256_srai_epi32(v[ 6], 2);
+          u[ 7] = _mm256_srai_epi32(v[ 7], 2);
+          u[ 8] = _mm256_srai_epi32(v[ 8], 2);
+          u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+          out[18] = _mm256_packs_epi32(u[2], u[3]);
+          out[10] = _mm256_packs_epi32(u[4], u[5]);
+          out[26] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+          out[22] = _mm256_packs_epi32(u[10], u[11]);
+          out[14] = _mm256_packs_epi32(u[12], u[13]);
+          out[30] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+        }
+        // stage 8
+        {
+          const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
+          const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
+          const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
+          const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
+          const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
+          const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
+          const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
+          const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+          out[17] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+          out[25] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+          out[23] = _mm256_packs_epi32(u[10], u[11]);
+          out[15] = _mm256_packs_epi32(u[12], u[13]);
+          out[31] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
+          const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
+          const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
+          const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
+          const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
+          const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
+          const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
+          const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+          out[21] = _mm256_packs_epi32(u[2], u[3]);
+          out[13] = _mm256_packs_epi32(u[4], u[5]);
+          out[29] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+          out[19] = _mm256_packs_epi32(u[10], u[11]);
+          out[11] = _mm256_packs_epi32(u[12], u[13]);
+          out[27] = _mm256_packs_epi32(u[14], u[15]);
+        }
+      }
+#endif
+      // Transpose the results, do it as four 8x8 transposes.
+      {
+        int transpose_block;
+        int16_t *output_currStep,*output_nextStep;
+        if (0 == pass){
+                 output_currStep = &intermediate[column_start * 32];
+                 output_nextStep = &intermediate[(column_start + 8) * 32];
+        } else{
+                 output_currStep = &output_org[column_start * 32];
+                 output_nextStep = &output_org[(column_start + 8) * 32];
+        }
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m256i *this_out = &out[8 * transpose_block];
+          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
+          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
+          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
+          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
+          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
+          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
+          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
+          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
+          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  111
+          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
+          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
+          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
+          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
+          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
+          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+          // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+          // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+            //           ... and here.
+            //           PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic makes the code slightly slower.
+          _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+          _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+          _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+          _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+          _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+          _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+          _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+          _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+          _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+          // Process next 8x8
+          output_currStep += 8;
+          output_nextStep += 8;
+        }
+      }
+    }
+  }
+}  // NOLINT
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 2d59775cec6..42fdbbdc5ce 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -12,6 +12,9 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32(b, a, b, a)
+
 #if FDCT32x32_HIGH_PRECISION
 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
   __m128i buf0, buf1;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
new file mode 100644
index 00000000000..b5269ed0303
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
@@ -0,0 +1,2592 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in0, in1, in2, in3;
+  // Load inputs.
+  {
+    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
+    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
+    // x = x << 4
+    in0 = _mm_slli_epi16(in0, 4);
+    in1 = _mm_slli_epi16(in1, 4);
+    in2 = _mm_slli_epi16(in2, 4);
+    in3 = _mm_slli_epi16(in3, 4);
+    // if (i == 0 && input[0]) input[0] += 1;
+    {
+      // Only lane 0 of the mask can be set, and only when the first value is
+      // zero: every other lane is compared against 1, which a value shifted
+      // left by 4 (above) can never equal. To increment in the non-zero case,
+      // we add the mask and then one to the first element:
+      //   - if zero, mask = -1, v = v - 1 + 1 = v
+      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+      in0 = _mm_add_epi16(in0, mask);
+      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+    }
+  }
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // Transform 1/2: Add/subtract
+    const __m128i r0 = _mm_add_epi16(in0, in3);
+    const __m128i r1 = _mm_add_epi16(in1, in2);
+    const __m128i r2 = _mm_sub_epi16(in1, in2);
+    const __m128i r3 = _mm_sub_epi16(in0, in3);
+    // Transform 1/2: Interleave to do the multiply by constants which gets us
+    //                into 32 bits.
+    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+    // Combine and transpose
+    const __m128i res0 = _mm_packs_epi32(w0, w2);
+    const __m128i res1 = _mm_packs_epi32(w4, w6);
+    // 00 01 02 03 20 21 22 23
+    // 10 11 12 13 30 31 32 33
+    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
+    // 00 10 01 11 02 12 03 13
+    // 20 30 21 31 22 32 23 33
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
+    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
+    if (0 == pass) {
+      // Extract values in the high part for second pass as transform code
+      // only uses the first four values.
+      in1 = _mm_unpackhi_epi64(in0, in0);
+      in3 = _mm_unpackhi_epi64(in2, in2);
+    } else {
+      // Post-condition output, (v + 1) >> 2, and store it, taking advantage
+      // of the fact that rows 1/3 are stored just after rows 0/2.
+      __m128i out01 = _mm_add_epi16(in0, kOne);
+      __m128i out23 = _mm_add_epi16(in2, kOne);
+      out01 = _mm_srai_epi16(out01, 2);
+      out23 = _mm_srai_epi16(out23, 2);
+      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
+    }
+  }
+}
+
+static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
+                                   int stride) {
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i mask;
+
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 4);
+  in[1] = _mm_slli_epi16(in[1], 4);
+  in[2] = _mm_slli_epi16(in[2], 4);
+  in[3] = _mm_slli_epi16(in[3], 4);
+
+  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+  in[0] = _mm_add_epi16(in[0], mask);
+  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+  __m128i out01 = _mm_add_epi16(in01, kOne);
+  __m128i out23 = _mm_add_epi16(in23, kOne);
+  out01 = _mm_srai_epi16(out01, 2);
+  out23 = _mm_srai_epi16(out23, 2);
+  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
+  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+}
+
+static INLINE void transpose_4x4_avx2(__m128i *res) {
+  // Combine and transpose
+  // 00 01 02 03 20 21 22 23
+  // 10 11 12 13 30 31 32 33
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+  // 00 10 20 30 01 11 21 31
+  // 02 12 22 32 03 13 23 33
+  // only use the first 4 16-bit integers
+  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void fdct4_avx2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u[4], v[4];
+  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
+  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
+
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
+  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+  transpose_4x4_avx2(in);
+}
+
+void fadst4_avx2(__m128i *in) {
+  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+  __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);
+  u[3] = _mm_unpacklo_epi16(in[2], kZero);
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_sub_epi32(u[4], v[5]);
+  u[6] = _mm_add_epi32(u[3], u[5]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[2]);
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4_avx2(in);
+}
+
+void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
+  __m128i in[4];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vp9_fdct4x4_avx2(input, output, stride);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4_avx2(input, in, stride);
+      fadst4_avx2(in);
+      fdct4_avx2(in);
+      write_buffer_4x4_avx2(output, in);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4_avx2(input, in, stride);
+      fdct4_avx2(in);
+      fadst4_avx2(in);
+      write_buffer_4x4_avx2(output, in);
+      break;
+    case ADST_ADST:
+      load_buffer_4x4_avx2(input, in, stride);
+      fadst4_avx2(in);
+      fadst4_avx2(in);
+      write_buffer_4x4_avx2(output, in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/subtract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/subtract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division by two of 16-bit signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results
+    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+  }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 2);
+  in[1] = _mm_slli_epi16(in[1], 2);
+  in[2] = _mm_slli_epi16(in[2], 2);
+  in[3] = _mm_slli_epi16(in[3], 2);
+  in[4] = _mm_slli_epi16(in[4], 2);
+  in[5] = _mm_slli_epi16(in[5], 2);
+  in[6] = _mm_slli_epi16(in[6], 2);
+  in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
+static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  const int bit_m02 = bit - 2;
+  __m128i sign0 = _mm_srai_epi16(res[0], 15);
+  __m128i sign1 = _mm_srai_epi16(res[1], 15);
+  __m128i sign2 = _mm_srai_epi16(res[2], 15);
+  __m128i sign3 = _mm_srai_epi16(res[3], 15);
+  __m128i sign4 = _mm_srai_epi16(res[4], 15);
+  __m128i sign5 = _mm_srai_epi16(res[5], 15);
+  __m128i sign6 = _mm_srai_epi16(res[6], 15);
+  __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+  if (bit_m02 >= 0) {
+    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+    res[0] = _mm_add_epi16(res[0], k_const_rounding);
+    res[1] = _mm_add_epi16(res[1], k_const_rounding);
+    res[2] = _mm_add_epi16(res[2], k_const_rounding);
+    res[3] = _mm_add_epi16(res[3], k_const_rounding);
+    res[4] = _mm_add_epi16(res[4], k_const_rounding);
+    res[5] = _mm_add_epi16(res[5], k_const_rounding);
+    res[6] = _mm_add_epi16(res[6], k_const_rounding);
+    res[7] = _mm_add_epi16(res[7], k_const_rounding);
+  }
+
+  res[0] = _mm_sub_epi16(res[0], sign0);
+  res[1] = _mm_sub_epi16(res[1], sign1);
+  res[2] = _mm_sub_epi16(res[2], sign2);
+  res[3] = _mm_sub_epi16(res[3], sign3);
+  res[4] = _mm_sub_epi16(res[4], sign4);
+  res[5] = _mm_sub_epi16(res[5], sign5);
+  res[6] = _mm_sub_epi16(res[6], sign6);
+  res[7] = _mm_sub_epi16(res[7], sign7);
+
+  res[0] = _mm_srai_epi16(res[0], bit);
+  res[1] = _mm_srai_epi16(res[1], bit);
+  res[2] = _mm_srai_epi16(res[2], bit);
+  res[3] = _mm_srai_epi16(res[3], bit);
+  res[4] = _mm_srai_epi16(res[4], bit);
+  res[5] = _mm_srai_epi16(res[5], bit);
+  res[6] = _mm_srai_epi16(res[6], bit);
+  res[7] = _mm_srai_epi16(res[7], bit);
+}
+
+// write 8x8 array
+static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
+  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+}
+
+// transpose the 8x8 block in `in` into `res` (safe when res aliases in)
+static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  // 04 14 05 15 06 16 07 17
+  // 24 34 25 35 26 36 27 37
+  // 40 50 41 51 42 52 43 53
+  // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+  // 64 74 65 75 66 76 67 77
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // 00 10 20 30 01 11 21 31
+  // 40 50 60 70 41 51 61 71
+  // 02 12 22 32 03 13 23 33
+  // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+  // 06 16 26 36 07 17 27 37
+  // 46 56 66 76 47 57 67 77
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+  // 00 10 20 30 40 50 60 70
+  // 01 11 21 31 41 51 61 71
+  // 02 12 22 32 42 52 62 72
+  // 03 13 23 33 43 53 63 73
+  // 04 14 24 34 44 54 64 74
+  // 05 15 25 35 45 55 65 75
+  // 06 16 26 36 46 56 66 76
+  // 07 17 27 37 47 57 67 77
+}
+
+void fdct8_avx2(__m128i *in) {
+  // Works in place on in[0..7] and ends with an 8x8 transpose. Constants:
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 1: sums (s0-s3) and differences (s4-s7) of mirrored inputs
+  s0 = _mm_add_epi16(in[0], in[7]);
+  s1 = _mm_add_epi16(in[1], in[6]);
+  s2 = _mm_add_epi16(in[2], in[5]);
+  s3 = _mm_add_epi16(in[3], in[4]);
+  s4 = _mm_sub_epi16(in[3], in[4]);
+  s5 = _mm_sub_epi16(in[2], in[5]);
+  s6 = _mm_sub_epi16(in[1], in[6]);
+  s7 = _mm_sub_epi16(in[0], in[7]);
+
+  u0 = _mm_add_epi16(s0, s3);
+  u1 = _mm_add_epi16(s1, s2);
+  u2 = _mm_sub_epi16(s1, s2);
+  u3 = _mm_sub_epi16(s0, s3);
+  // interleave 16-bit pairs so each madd yields a*c0 + b*c1 as 32 bits
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpackhi_epi16(u0, u1);
+  v2 = _mm_unpacklo_epi16(u2, u3);
+  v3 = _mm_unpackhi_epi16(u2, u3);
+
+  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+  // round: add DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  // saturating pack to 16 bits; these are the even-indexed outputs
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+  in[6] = _mm_packs_epi32(u6, u7);
+
+  // stage 2: only the s6/s5 differences are multiplied here
+  // interleave and perform butterfly multiplication/addition
+  u0 = _mm_unpacklo_epi16(s6, s5);
+  u1 = _mm_unpackhi_epi16(s6, s5);
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+  // round: add DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+  u0 = _mm_packs_epi32(v0, v1);
+  u1 = _mm_packs_epi32(v2, v3);
+
+  // stage 3: combine s4 and s7 with the two stage-2 results (u0, u1)
+  s0 = _mm_add_epi16(s4, u0);
+  s1 = _mm_sub_epi16(s4, u0);
+  s2 = _mm_sub_epi16(s7, u1);
+  s3 = _mm_add_epi16(s7, u1);
+
+  // stage 4: pair (s0, s3) and (s1, s2), multiply-add by cosine pairs
+  u0 = _mm_unpacklo_epi16(s0, s3);
+  u1 = _mm_unpackhi_epi16(s0, s3);
+  u2 = _mm_unpacklo_epi16(s1, s2);
+  u3 = _mm_unpackhi_epi16(s1, s2);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+  // round: add DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+  // saturating pack to 16 bits; these are the odd-indexed outputs
+  in[1] = _mm_packs_epi32(v0, v1);
+  in[3] = _mm_packs_epi32(v4, v5);
+  in[5] = _mm_packs_epi32(v2, v3);
+  in[7] = _mm_packs_epi32(v6, v7);
+
+  // transpose the 8x8 block in place (source and destination are both in)
+  array_transpose_8x8_avx2(in, in);
+}
+
+void fadst8_avx2(__m128i *in) {
+  // Works in place on in[0..7] and ends with an 8x8 transpose. Constants:
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // reorder the inputs so each butterfly's pair sits in adjacent locals
+  in0  = in[7];
+  in1  = in[0];
+  in2  = in[5];
+  in3  = in[2];
+  in4  = in[3];
+  in5  = in[4];
+  in6  = in[1];
+  in7  = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition (w0-w7) and subtraction (w8-w15) of the 32-bit products
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // round: add DCT_CONST_ROUNDING, then arithmetic shift by DCT_CONST_BITS
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit, 8 integers per __m128i; in[] is reused as scratch here
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2: add/sub on in[0..3]; madd by cosine pairs on in[4..7]
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3: (s2, s3) and (s6, s7) are each combined with cospi_16_64
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+  // final ordering; the odd-indexed outputs are negated as (0 - x)
+  // FIXME(jingning): do subtract using bit inversion?
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+
+  // transpose the 8x8 block in place (source and destination are both in)
+  array_transpose_8x8_avx2(in, in);
+}
+
+void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
+  __m128i rows[8];
+
+  // DCT_DCT is handed off whole to vp9_fdct8x8_avx2.
+  if (tx_type == DCT_DCT) {
+    vp9_fdct8x8_avx2(input, output, stride);
+    return;
+  }
+
+  // Only the three hybrid types remain valid; anything else is rejected.
+  if (tx_type != ADST_DCT && tx_type != DCT_ADST && tx_type != ADST_ADST) {
+    assert(0);
+    return;
+  }
+
+  load_buffer_8x8_avx2(input, rows, stride);
+
+  // First 1-D step: fdct8 only for DCT_ADST, fadst8 otherwise.
+  if (tx_type == DCT_ADST) {
+    fdct8_avx2(rows);
+  } else {
+    fadst8_avx2(rows);
+  }
+  // Second 1-D step: fdct8 only for ADST_DCT, fadst8 otherwise.
+  if (tx_type == ADST_DCT) {
+    fdct8_avx2(rows);
+  } else {
+    fadst8_avx2(rows);
+  }
+
+  right_shift_8x8_avx2(rows, 1);
+  write_buffer_8x8_avx2(output, rows, 8);
+}
+
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transform the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
+  const int16_t *in = input;
+  int16_t *out = intermediate;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kOne = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 16; column_start += 8) {
+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+      __m128i step1_0, step1_1, step1_2, step1_3;
+      __m128i step1_4, step1_5, step1_6, step1_7;
+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+      __m128i step3_0, step3_1, step3_2, step3_3;
+      __m128i step3_4, step3_5, step3_6, step3_7;
+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+      // Load and pre-condition input.
+      if (0 == pass) {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
+        // x = x << 2
+        in00 = _mm_slli_epi16(in00, 2);
+        in01 = _mm_slli_epi16(in01, 2);
+        in02 = _mm_slli_epi16(in02, 2);
+        in03 = _mm_slli_epi16(in03, 2);
+        in04 = _mm_slli_epi16(in04, 2);
+        in05 = _mm_slli_epi16(in05, 2);
+        in06 = _mm_slli_epi16(in06, 2);
+        in07 = _mm_slli_epi16(in07, 2);
+        in08 = _mm_slli_epi16(in08, 2);
+        in09 = _mm_slli_epi16(in09, 2);
+        in10 = _mm_slli_epi16(in10, 2);
+        in11 = _mm_slli_epi16(in11, 2);
+        in12 = _mm_slli_epi16(in12, 2);
+        in13 = _mm_slli_epi16(in13, 2);
+        in14 = _mm_slli_epi16(in14, 2);
+        in15 = _mm_slli_epi16(in15, 2);
+      } else {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
+        // x = (x + 1) >> 2
+        in00 = _mm_add_epi16(in00, kOne);
+        in01 = _mm_add_epi16(in01, kOne);
+        in02 = _mm_add_epi16(in02, kOne);
+        in03 = _mm_add_epi16(in03, kOne);
+        in04 = _mm_add_epi16(in04, kOne);
+        in05 = _mm_add_epi16(in05, kOne);
+        in06 = _mm_add_epi16(in06, kOne);
+        in07 = _mm_add_epi16(in07, kOne);
+        in08 = _mm_add_epi16(in08, kOne);
+        in09 = _mm_add_epi16(in09, kOne);
+        in10 = _mm_add_epi16(in10, kOne);
+        in11 = _mm_add_epi16(in11, kOne);
+        in12 = _mm_add_epi16(in12, kOne);
+        in13 = _mm_add_epi16(in13, kOne);
+        in14 = _mm_add_epi16(in14, kOne);
+        in15 = _mm_add_epi16(in15, kOne);
+        in00 = _mm_srai_epi16(in00, 2);
+        in01 = _mm_srai_epi16(in01, 2);
+        in02 = _mm_srai_epi16(in02, 2);
+        in03 = _mm_srai_epi16(in03, 2);
+        in04 = _mm_srai_epi16(in04, 2);
+        in05 = _mm_srai_epi16(in05, 2);
+        in06 = _mm_srai_epi16(in06, 2);
+        in07 = _mm_srai_epi16(in07, 2);
+        in08 = _mm_srai_epi16(in08, 2);
+        in09 = _mm_srai_epi16(in09, 2);
+        in10 = _mm_srai_epi16(in10, 2);
+        in11 = _mm_srai_epi16(in11, 2);
+        in12 = _mm_srai_epi16(in12, 2);
+        in13 = _mm_srai_epi16(in13, 2);
+        in14 = _mm_srai_epi16(in14, 2);
+        in15 = _mm_srai_epi16(in15, 2);
+      }
+      in += 8;
+      // Calculate input for the first 8 results.
+      {
+        input0 = _mm_add_epi16(in00, in15);
+        input1 = _mm_add_epi16(in01, in14);
+        input2 = _mm_add_epi16(in02, in13);
+        input3 = _mm_add_epi16(in03, in12);
+        input4 = _mm_add_epi16(in04, in11);
+        input5 = _mm_add_epi16(in05, in10);
+        input6 = _mm_add_epi16(in06, in09);
+        input7 = _mm_add_epi16(in07, in08);
+      }
+      // Calculate input for the next 8 results.
+      {
+        step1_0 = _mm_sub_epi16(in07, in08);
+        step1_1 = _mm_sub_epi16(in06, in09);
+        step1_2 = _mm_sub_epi16(in05, in10);
+        step1_3 = _mm_sub_epi16(in04, in11);
+        step1_4 = _mm_sub_epi16(in03, in12);
+        step1_5 = _mm_sub_epi16(in02, in13);
+        step1_6 = _mm_sub_epi16(in01, in14);
+        step1_7 = _mm_sub_epi16(in00, in15);
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        // Add/subtract
+        const __m128i q0 = _mm_add_epi16(input0, input7);
+        const __m128i q1 = _mm_add_epi16(input1, input6);
+        const __m128i q2 = _mm_add_epi16(input2, input5);
+        const __m128i q3 = _mm_add_epi16(input3, input4);
+        const __m128i q4 = _mm_sub_epi16(input3, input4);
+        const __m128i q5 = _mm_sub_epi16(input2, input5);
+        const __m128i q6 = _mm_sub_epi16(input1, input6);
+        const __m128i q7 = _mm_sub_epi16(input0, input7);
+        // Work on first four results
+        {
+          // Add/subtract
+          const __m128i r0 = _mm_add_epi16(q0, q3);
+          const __m128i r1 = _mm_add_epi16(q1, q2);
+          const __m128i r2 = _mm_sub_epi16(q1, q2);
+          const __m128i r3 = _mm_sub_epi16(q0, q3);
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res00 = _mm_packs_epi32(w0, w1);
+          res08 = _mm_packs_epi32(w2, w3);
+          res04 = _mm_packs_epi32(w4, w5);
+          res12 = _mm_packs_epi32(w6, w7);
+        }
+        // Work on next four results
+        {
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+          // Combine
+          const __m128i r0 = _mm_packs_epi32(s0, s1);
+          const __m128i r1 = _mm_packs_epi32(s2, s3);
+          // Add/subtract
+          const __m128i x0 = _mm_add_epi16(q4, r0);
+          const __m128i x1 = _mm_sub_epi16(q4, r0);
+          const __m128i x2 = _mm_sub_epi16(q7, r1);
+          const __m128i x3 = _mm_add_epi16(q7, r1);
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res02 = _mm_packs_epi32(w0, w1);
+          res14 = _mm_packs_epi32(w2, w3);
+          res10 = _mm_packs_epi32(w4, w5);
+          res06 = _mm_packs_epi32(w6, w7);
+        }
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_2 = _mm_packs_epi32(w0, w1);
+          step2_3 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_5 = _mm_packs_epi32(w0, w1);
+          step2_4 = _mm_packs_epi32(w2, w3);
+        }
+        // step 3
+        {
+          step3_0 = _mm_add_epi16(step1_0, step2_3);
+          step3_1 = _mm_add_epi16(step1_1, step2_2);
+          step3_2 = _mm_sub_epi16(step1_1, step2_2);
+          step3_3 = _mm_sub_epi16(step1_0, step2_3);
+          step3_4 = _mm_sub_epi16(step1_7, step2_4);
+          step3_5 = _mm_sub_epi16(step1_6, step2_5);
+          step3_6 = _mm_add_epi16(step1_6, step2_5);
+          step3_7 = _mm_add_epi16(step1_7, step2_4);
+        }
+        // step 4
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_1 = _mm_packs_epi32(w0, w1);
+          step2_2 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_6 = _mm_packs_epi32(w0, w1);
+          step2_5 = _mm_packs_epi32(w2, w3);
+        }
+        // step 5
+        {
+          step1_0 = _mm_add_epi16(step3_0, step2_1);
+          step1_1 = _mm_sub_epi16(step3_0, step2_1);
+          step1_2 = _mm_sub_epi16(step3_3, step2_2);
+          step1_3 = _mm_add_epi16(step3_3, step2_2);
+          step1_4 = _mm_add_epi16(step3_4, step2_5);
+          step1_5 = _mm_sub_epi16(step3_4, step2_5);
+          step1_6 = _mm_sub_epi16(step3_7, step2_6);
+          step1_7 = _mm_add_epi16(step3_7, step2_6);
+        }
+        // step 6
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res01 = _mm_packs_epi32(w0, w1);
+          res09 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res05 = _mm_packs_epi32(w0, w1);
+          res13 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res11 = _mm_packs_epi32(w0, w1);
+          res03 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res15 = _mm_packs_epi32(w0, w1);
+          res07 = _mm_packs_epi32(w2, w3);
+        }
+      }
+      // Transpose the results, do it as two 8x8 transposes.
+      {
+        // 00 01 02 03 04 05 06 07
+        // 10 11 12 13 14 15 16 17
+        // 20 21 22 23 24 25 26 27
+        // 30 31 32 33 34 35 36 37
+        // 40 41 42 43 44 45 46 47
+        // 50 51 52 53 54 55 56 57
+        // 60 61 62 63 64 65 66 67
+        // 70 71 72 73 74 75 76 77
+        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+        // 00 10 01 11 02 12 03 13
+        // 20 30 21 31 22 32 23 33
+        // 04 14 05 15 06 16 07 17
+        // 24 34 25 35 26 36 27 37
+        // 40 50 41 51 42 52 43 53
+        // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+        // 64 74 65 75 66 76 67 77
+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+        // 00 10 20 30 01 11 21 31
+        // 40 50 60 70 41 51 61 71
+        // 02 12 22 32 03 13 23 33
+        // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+        // 06 16 26 36 07 17 27 37
+        // 46 56 66 76 47 57 67 77
+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+        // 00 10 20 30 40 50 60 70
+        // 01 11 21 31 41 51 61 71
+        // 02 12 22 32 42 52 62 72
+        // 03 13 23 33 43 53 63 73
+        // 04 14 24 34 44 54 64 74
+        // 05 15 25 35 45 55 65 75
+        // 06 16 26 36 46 56 66 76
+        // 07 17 27 37 47 57 67 77
+        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
+        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
+        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
+        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
+        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
+        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
+        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
+        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
+      }
+      {
+        // 00 01 02 03 04 05 06 07
+        // 10 11 12 13 14 15 16 17
+        // 20 21 22 23 24 25 26 27
+        // 30 31 32 33 34 35 36 37
+        // 40 41 42 43 44 45 46 47
+        // 50 51 52 53 54 55 56 57
+        // 60 61 62 63 64 65 66 67
+        // 70 71 72 73 74 75 76 77
+        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+        // 00 10 01 11 02 12 03 13
+        // 20 30 21 31 22 32 23 33
+        // 04 14 05 15 06 16 07 17
+        // 24 34 25 35 26 36 27 37
+        // 40 50 41 51 42 52 43 53
+        // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+        // 64 74 65 75 66 76 67 77
+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+        // 00 10 20 30 01 11 21 31
+        // 40 50 60 70 41 51 61 71
+        // 02 12 22 32 03 13 23 33
+        // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+        // 06 16 26 36 07 17 27 37
+        // 46 56 66 76 47 57 67 77
+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+        // 00 10 20 30 40 50 60 70
+        // 01 11 21 31 41 51 61 71
+        // 02 12 22 32 42 52 62 72
+        // 03 13 23 33 43 53 63 73
+        // 04 14 24 34 44 54 64 74
+        // 05 15 25 35 45 55 65 75
+        // 06 16 26 36 46 56 66 76
+        // 07 17 27 37 47 57 67 77
+        // Store results
+        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+      }
+      out += 8*16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+// Fills in0[0..15] and in1[0..15] from a 16x16 region of `input` using four
+// 8x8 loads.  in0 takes the data starting at column 0, in1 the data starting
+// at column 8; within each, entries 8..15 come from 8 * stride elements
+// further into `input`.
+static INLINE void load_buffer_16x16_avx2(const int16_t *input, __m128i *in0,
+                                          __m128i *in1, int stride) {
+  // Start addresses of the four 8x8 sub-blocks.
+  const int16_t *const left_upper = input;
+  const int16_t *const left_lower = input + 8 * stride;
+  const int16_t *const right_upper = input + 8;
+  const int16_t *const right_lower = right_upper + 8 * stride;
+
+  // Columns 0-7 go to in0.
+  load_buffer_8x8_avx2(left_upper, in0, stride);
+  load_buffer_8x8_avx2(left_lower, in0 + 8, stride);
+
+  // Columns 8-15 go to in1.
+  load_buffer_8x8_avx2(right_upper, in1, stride);
+  load_buffer_8x8_avx2(right_lower, in1 + 8, stride);
+}
+
+// Stores in0[0..15] and in1[0..15] to a 16x16 region of `output` using four
+// 8x8 writes.  in0 is written starting at column 0, in1 starting at column 8;
+// entries 8..15 of each land 8 * stride elements further into `output`.
+static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
+                                           __m128i *in1, int stride) {
+  // Destination addresses of the four 8x8 sub-blocks.
+  int16_t *const left_upper = output;
+  int16_t *const left_lower = output + 8 * stride;
+  int16_t *const right_upper = output + 8;
+  int16_t *const right_lower = right_upper + 8 * stride;
+
+  // Columns 0-7 come from in0.
+  write_buffer_8x8_avx2(left_upper, in0, stride);
+  write_buffer_8x8_avx2(left_lower, in0 + 8, stride);
+
+  // Columns 8-15 come from in1.
+  write_buffer_8x8_avx2(right_upper, in1, stride);
+  write_buffer_8x8_avx2(right_lower, in1 + 8, stride);
+}
+
+// Rearranges the sixteen vectors in res0 and the sixteen in res1 with four
+// 8x8 transposes.  NOTE(review): array_transpose_8x8_avx2 is defined
+// elsewhere; it is presumably (src, dst) and the net effect presumably a
+// full 16x16 transpose - confirm against its definition.
+static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
+  __m128i held[8];
+  int k;
+
+  // res0[0..7] and res1[8..15] are passed as both arguments.  The result
+  // for res1[0..7] is parked in held[] because res1[0..7] is the second
+  // argument of the res0[8..15] call that follows.
+  array_transpose_8x8_avx2(res0, res0);
+  array_transpose_8x8_avx2(res1, held);
+  array_transpose_8x8_avx2(res0 + 8, res1);
+  array_transpose_8x8_avx2(res1 + 8, res1 + 8);
+
+  // Move the parked vectors into res0[8..15].
+  for (k = 0; k < 8; ++k) {
+    res0[8 + k] = held[k];
+  }
+}
+
+// Applies right_shift_8x8_avx2 with a shift amount of 2 to each group of
+// eight vectors: res0[0..7], res0[8..15], res1[0..7], res1[8..15].
+static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
+  const int shift_bits = 2;
+  __m128i *const res0_upper = res0 + 8;
+  __m128i *const res1_upper = res1 + 8;
+
+  // Rounding/shift is delegated to the 8x8 helper, one group at a time.
+  right_shift_8x8_avx2(res0, shift_bits);
+  right_shift_8x8_avx2(res0_upper, shift_bits);
+  right_shift_8x8_avx2(res1, shift_bits);
+  right_shift_8x8_avx2(res1_upper, shift_bits);
+}
+
+// In-place 1-D 16-point forward DCT over the sixteen vectors in[0..15].
+// The arithmetic uses the epi16 intrinsics, i.e. each __m128i is handled as
+// eight signed 16-bit lanes, so eight columns are transformed at once.
+//
+// Data flow visible below:
+//  - i[k] = in[k] + in[15 - k] feeds the even-indexed outputs
+//    (in[0], in[2], ..., in[14]).
+//  - s[k] = in[7 - k] - in[8 + k] feeds the odd-indexed outputs
+//    (in[1], in[3], ..., in[15]) through stages 2-6.
+//  - in[] is only read while building i[] and s[]; afterwards it is purely
+//    a destination, which is what makes the in-place stores safe.
+//
+// Every rotation follows the same pattern: interleave two vectors with
+// unpacklo/unpackhi, multiply-accumulate against a constant pair with
+// _mm_madd_epi16 (32-bit results), add k__DCT_CONST_ROUNDING, arithmetic
+// shift right by DCT_CONST_BITS, then narrow back to 16 bits with the
+// signed-saturating _mm_packs_epi32.
+//
+// NOTE(review): the 16-bit adds/subs wrap rather than saturate; the caller
+// presumably guarantees enough headroom in the inputs - confirm.
+void fdct16_8col_avx2(__m128i *in) {
+  // perform 16x16 1-D DCT for 8 columns
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+  // Constant pairs for the madd-based rotations.  pair_set_epi16 is defined
+  // elsewhere; presumably it repeats its two arguments as adjacent 16-bit
+  // lanes across the vector - confirm.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1
+  // Fold the input around its centre: sums of mirrored pairs go to i[].
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+
+  // Differences of the same mirrored pairs, innermost pair first, go to s[].
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+
+  // Second fold, on i[]: p[0..3] are sums, p[4..7] are differences.
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+
+  // Third fold, on p[0..3].
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+
+  // Interleave (u[0], u[1]) and (u[2], u[3]) so each 32-bit madd lane
+  // combines one element from each vector of the pair.
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+  // Four rotations; their results become in[0], in[8], in[4], in[12] below.
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+  // Add the rounding constant before the shift.
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  // Arithmetic shift back down by DCT_CONST_BITS.
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+  // Narrow the lo/hi halves back to one 16-bit vector per output.
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+
+  // Combine p[5] and p[6] with the (-c16, +c16) and (+c16, +c16) pairs.
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+
+  // Butterfly the rotated pair against the untouched p[4] and p[7].
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+  // Rotations whose results become in[2], in[10], in[6], in[14] below.
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  // All even-indexed outputs are complete after these four stores.
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2
+  // Odd-output path: combine the s[2]/s[5] and s[3]/s[4] pairs with the
+  // c16 constants; s[0], s[1], s[6], s[7] are used unrotated in stage 3.
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  // t[] is reused here; the stage-1 values in t[0..3] are no longer needed.
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3
+  // Butterflies between the unrotated s[] terms and the rotated t[2..5];
+  // p[] is reused as scratch.
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4
+  // Rotate the p[1]/p[6] and p[2]/p[5] pairs with the c8/c24 constants;
+  // p[0], p[3], p[4], p[7] pass through to stage 5 unchanged.
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5
+  // Butterflies between the pass-through p[] terms and the rotated t[];
+  // s[] is reused as scratch.
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_sub_epi16(p[3], t[2]);
+  s[3] = _mm_add_epi16(p[3], t[2]);
+  s[4] = _mm_add_epi16(p[4], t[5]);
+  s[5] = _mm_sub_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6
+  // Pair s[k] with s[7 - k]; each pair is rotated twice below to produce
+  // two of the eight odd-indexed outputs.
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  // Store the odd-indexed outputs; note the destination order is not
+  // sequential (1, 9, 5, 13, 3, 11, 7, 15).
+  in[1]  = _mm_packs_epi32(v[0], v[1]);
+  in[9]  = _mm_packs_epi32(v[2], v[3]);
+  in[5]  = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3]  = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7]  = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+void fadst16_8col_avx2(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+// One in-place pass of the 16-point forward DCT over a 16x16 block held in
+// two arrays of __m128i registers.
+//   in0, in1 - the two register arrays making up the block; each is handed to
+//              fdct16_8col_avx2 separately (presumably 8 columns apiece --
+//              TODO confirm against load_buffer_16x16_avx2).
+// The trailing array_transpose_16x16_avx2 transposes the pair, which is what
+// lets vp9_fht16x16_avx2 chain two such passes back to back.
+void fdct16_avx2(__m128i *in0, __m128i *in1) {
+  fdct16_8col_avx2(in0);
+  fdct16_8col_avx2(in1);
+  array_transpose_16x16_avx2(in0, in1);
+}
+
+// One in-place pass of the 16-point forward ADST over a 16x16 block held in
+// two arrays of __m128i registers.
+//   in0, in1 - the two register arrays making up the block; each is handed to
+//              fadst16_8col_avx2, which performs the 1-D ADST for 8 columns.
+// The trailing array_transpose_16x16_avx2 transposes the pair, which is what
+// lets vp9_fht16x16_avx2 chain two such passes back to back.
+void fadst16_avx2(__m128i *in0, __m128i *in1) {
+  fadst16_8col_avx2(in0);
+  fadst16_8col_avx2(in1);
+  array_transpose_16x16_avx2(in0, in1);
+}
+
+// Forward 16x16 hybrid transform; tx_type picks which 1-D transform is used
+// in each of the two passes.
+//   input   - source block; `stride` is passed through unchanged to
+//             vp9_fdct16x16_avx2 / load_buffer_16x16_avx2 as its row stride.
+//   output  - destination buffer; the non-DCT_DCT paths write it through
+//             write_buffer_16x16_avx2 with a stride of 16.
+//   tx_type - DCT_DCT, ADST_DCT, DCT_ADST or ADST_ADST. Any other value hits
+//             assert(0); if asserts are compiled out nothing is written.
+// Every non-DCT_DCT path has the same shape: load, first 1-D pass,
+// right_shift_16x16_avx2 on the intermediate, second 1-D pass, store.
+void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,
+                      int stride, int tx_type) {
+  __m128i in0[16], in1[16];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      // Pure DCT is delegated to the dedicated 2-D routine; in0/in1 unused.
+      vp9_fdct16x16_avx2(input, output, stride);
+      break;
+    case ADST_DCT:
+      // First pass ADST, second pass DCT.
+      load_buffer_16x16_avx2(input, in0, in1, stride);
+      fadst16_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fdct16_avx2(in0, in1);
+      write_buffer_16x16_avx2(output, in0, in1, 16);
+      break;
+    case DCT_ADST:
+      // First pass DCT, second pass ADST.
+      load_buffer_16x16_avx2(input, in0, in1, stride);
+      fdct16_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
+      write_buffer_16x16_avx2(output, in0, in1, 16);
+      break;
+    case ADST_ADST:
+      // ADST in both passes.
+      load_buffer_16x16_avx2(input, in0, in1, stride);
+      fadst16_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fadst16_avx2(in0, in1);
+      write_buffer_16x16_avx2(output, in0, in1, 16);
+      break;
+    default:
+      // Unknown transform type: programming error.
+      assert(0);
+      break;
+  }
+}
+
+// The same source file is included twice with different macro settings, so it
+// acts as a template: FDCT32x32_2D_AVX2 presumably names the function the
+// included file defines and FDCT32x32_HIGH_PRECISION selects the arithmetic
+// variant (TODO confirm in vp9_dct32x32_avx2.c). Both macros are #undef'd
+// after each inclusion so the next one starts clean.
+
+// Instantiation 1: vp9_fdct32x32_rd_avx2, FDCT32x32_HIGH_PRECISION = 0.
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#undef  FDCT32x32_2D_AVX2
+#undef  FDCT32x32_HIGH_PRECISION
+
+// Instantiation 2: vp9_fdct32x32_avx2, FDCT32x32_HIGH_PRECISION = 1.
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
+#undef  FDCT32x32_2D_AVX2
+#undef  FDCT32x32_HIGH_PRECISION
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
new file mode 100644
index 00000000000..f71181c5e91
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -0,0 +1,70 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; One 1-D pass of the 4-point transform over the four rows held in m0..m3
+; (labelled a1, b1, c1, d1 where fwht4x4 loads them). m4 and m5 are scratch.
+; All arithmetic is on packed 16-bit words, so the word lanes of each
+; register are transformed independently of one another.
+; On exit the SWAPs rename registers so that m0 = a, m1 = c, m2 = d, m3 = b.
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1 ; a1 += b1
+  movq            m4,        m0
+  psubw           m3,        m2 ; d1 -= c1
+  psubw           m4,        m3 ; m4 = a1 - d1
+  psraw           m4,        1  ; e1 = (a1 - d1) >> 1, arithmetic shift
+  movq            m5,        m4
+  psubw           m5,        m1 ; b1 = e1 - b1
+  psubw           m4,        m2 ; c1 = e1 - c1
+  psubw           m0,        m4 ; a1 -= c1
+  paddw           m3,        m5 ; d1 += b1
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+; Transposes the 4x4 matrix of 16-bit words held one row per register in
+; m0..m3, using m4 and m5 as scratch. With the input rows written as
+; m0 = [a0 a1 a2 a3], m1 = [b0 ..], m2 = [c0 ..], m3 = [d0 ..], the lane
+; contents after each step are noted on the right.
+%macro TRANSPOSE_4X4 0
+  movq            m4,        m0
+  movq            m5,        m2
+  punpcklwd       m4,        m1 ; m4 = [a0 b0 a1 b1]
+  punpckhwd       m0,        m1 ; m0 = [a2 b2 a3 b3]
+  punpcklwd       m5,        m3 ; m5 = [c0 d0 c1 d1]
+  punpckhwd       m2,        m3 ; m2 = [c2 d2 c3 d3]
+  movq            m1,        m4
+  movq            m3,        m0
+  punpckldq       m1,        m5 ; m1 = [a0 b0 c0 d0]  (output row 0)
+  punpckhdq       m4,        m5 ; m4 = [a1 b1 c1 d1]  (output row 1)
+  punpckldq       m3,        m2 ; m3 = [a2 b2 c2 d2]  (output row 2)
+  punpckhdq       m0,        m2 ; m0 = [a3 b3 c3 d3]  (output row 3)
+  ; Rename so the output rows land in m0..m3 in order (treating the
+  ; multi-argument SWAP as x86inc's chain of pairwise swaps).
+  SWAP            2, 3, 0, 1, 4
+%endmacro
+
+; fwht4x4(input, output, stride) -- MMX build, declared through x86inc's
+; cglobal with 3 arguments and 4 general-purpose registers (r3 is scratch).
+; Loads four 8-byte rows from input, spaced strideq*2 bytes apart, runs
+; TRANSFORM_COLS + TRANSPOSE_4X4 twice, shifts every 16-bit result left by
+; 2 and stores the four rows contiguously (8 bytes each) at output.
+; NOTE(review): strideq is used at full register width with no sign
+; extension. If the C prototype declares stride as a 32-bit int, the upper
+; half is not guaranteed clean on x86-64 -- confirm whether a
+; `movsxdifnidn strideq, strided` is needed here.
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4] ; r3 -> row 2
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  ; Two transform passes; the transpose between them makes the second pass
+  ; run along the other dimension, and the final transpose restores the
+  ; original orientation.
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  ; Scale every 16-bit result by 4.
+  psllw           m0,        2
+  psllw           m1,        2
+  psllw           m2,        2
+  psllw           m3,        2
+
+  ; Output rows are packed back to back, 8 bytes per row.
+  movq            [outputq],      m0
+  movq            [outputq + 8],  m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+
+  RET
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index dc115018ec4..68658223858 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -13,40 +13,82 @@
 #include "vpx_ports/mem.h"
 
 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
+  // This 2D transform implements 4 vertical 1D transforms followed
+  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
+  // by Chen, Smith and Fralick ('77).  The commands for moving the data
+  // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations.  The outputs, o0 through oF are labeled according to the
+  // output locations.
+
   // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+  // where cospi_N_64 = cos(N pi /64)
+  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            -cospi_24_64, cospi_8_64,
+                                            -cospi_24_64, cospi_8_64);
+
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // This second rounding constant saves doing some extra adds at the end
+  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+                                               +(DCT_CONST_ROUNDING << 1));
+  const int DCT_CONST_BITS2 =  DCT_CONST_BITS+2;
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  const __m128i kOne = _mm_set1_epi16(1);
-  __m128i in0, in1, in2, in3;
+  __m128i in0, in1;
+
   // Load inputs.
   {
     in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
     in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
-    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
-    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
-    // x = x << 4
+    in1  = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+           (input +  2 * stride)));
+    in0  = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+           (input +  3 * stride)));
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+
+
+    // multiply by 16 to give some extra precision
     in0 = _mm_slli_epi16(in0, 4);
     in1 = _mm_slli_epi16(in1, 4);
-    in2 = _mm_slli_epi16(in2, 4);
-    in3 = _mm_slli_epi16(in3, 4);
     // if (i == 0 && input[0]) input[0] += 1;
+    // add 1 to the upper left pixel if it is non-zero, which helps reduce
+    // the round-trip error
     {
-      // The mask will only contain wether the first value is zero, all
+      // The mask will only contain whether the first value is zero, all
       // other comparison will fail as something shifted by 4 (above << 4)
       // can never be equal to one. To increment in the non-zero case, we
       // add the mask and one for the first element:
@@ -57,60 +99,119 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
       in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
     }
   }
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // Transform 1/2: Add/substract
-    const __m128i r0 = _mm_add_epi16(in0, in3);
-    const __m128i r1 = _mm_add_epi16(in1, in2);
-    const __m128i r2 = _mm_sub_epi16(in1, in2);
-    const __m128i r3 = _mm_sub_epi16(in0, in3);
-    // Transform 1/2: Interleave to do the multiply by constants which gets us
-    //                into 32 bits.
-    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+  // There are 4 total stages, alternating between an add/subtract stage
+  // followed by a multiply-and-add stage.
+  {
+    // Stage 1: Add/subtract
+
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+    // r1 = [iC i8 iD i9 iE iA iF iB]
+    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+    // r3 = [iC i8 iD i9 iF iB iE iA]
+
+    const __m128i t0 = _mm_add_epi16(r2, r3);
+    const __m128i t1 = _mm_sub_epi16(r2, r3);
+    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+    // t1 = [aC a8 aD a9 aF aB aE aA]
+
+    // Stage 2: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+    // Then add and right-shift to get back to 16-bit range
     const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-    // Combine and transpose
-    const __m128i res0 = _mm_packs_epi32(w0, w2);
-    const __m128i res1 = _mm_packs_epi32(w4, w6);
-    // 00 01 02 03 20 21 22 23
-    // 10 11 12 13 30 31 32 33
-    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
-    // 00 10 01 11 02 12 03 13
-    // 20 30 21 31 22 32 23 33
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
-    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
-    if (0 == pass) {
-      // Extract values in the high part for second pass as transform code
-      // only uses the first four values.
-      in1 = _mm_unpackhi_epi64(in0, in0);
-      in3 = _mm_unpackhi_epi64(in2, in2);
-    } else {
-      // Post-condition output and store it (v + 1) >> 2, taking advantage
-      // of the fact 1/3 are stored just after 0/2.
-      __m128i out01 = _mm_add_epi16(in0, kOne);
-      __m128i out23 = _mm_add_epi16(in2, kOne);
-      out01 = _mm_srai_epi16(out01, 2);
-      out23 = _mm_srai_epi16(out23, 2);
-      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
-      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
-    }
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+    // w0 = [b0 b1 b7 b6]
+    // w1 = [b8 b9 bF bE]
+    // w2 = [b4 b5 b3 b2]
+    // w3 = [bC bD bB bA]
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+    // x1 = [b4 b5 b3 b2 bC bD bB bA]
+    in0 = _mm_shuffle_epi32(x0, 0xD8);
+    in1 = _mm_shuffle_epi32(x1, 0x8D);
+    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+    // in1 = [b3 b2 bB bA b4 b5 bC bD]
+  }
+  {
+    // vertical DCTs finished. Now we do the horizontal DCTs.
+    // Stage 3: Add/subtract
+
+    const __m128i t0 = _mm_add_epi16(in0, in1);
+    const __m128i t1 = _mm_sub_epi16(in0, in1);
+    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
+    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+
+    // Stage 4: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+    // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+    // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+    // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+    const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+    const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+    // Then add and right-shift to get back to 16-bit range
+    // but this combines the final right-shift as well to save operations
+    // This unusual rounding operation is to maintain bit-accurate
+    // compatibility with the c version of this function which has two
+    // rounding steps in a row.
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+    // w0 = [o0 o4 o8 oC]
+    // w1 = [o2 o6 oA oE]
+    // w2 = [o1 o5 o9 oD]
+    // w3 = [o3 o7 oB oF]
+    // remember the o's are numbered according to the correct output location
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+    // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+    // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+    const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+    const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+    // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+    // y1 = [o2 o3 o6 o7 oA oB oE oF]
+    in0 = _mm_unpacklo_epi32(y0, y1);
+    // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+    in1 = _mm_unpackhi_epi32(y0, y1);
+    // in1 = [o8 o9 oA oB oC oD oE oF]
+  }
+  // Post-condition (v + 1) >> 2 is now incorporated into previous
+  // add and right-shift commands.  Only 2 store instructions needed
+  // because we are using the fact that 1/3 are stored just after 0/2.
+  {
+     _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
+     _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
   }
 }
 
+
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                    int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
@@ -163,7 +264,7 @@ static INLINE void transpose_4x4(__m128i *res) {
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void fdct4_1d_sse2(__m128i *in) {
+void fdct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -196,7 +297,7 @@ void fdct4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void fadst4_1d_sse2(__m128i *in) {
+void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
   const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
@@ -206,12 +307,12 @@ void fadst4_1d_sse2(__m128i *in) {
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8];
   __m128i in7 = _mm_add_epi16(in[0], in[1]);
-  in7 = _mm_sub_epi16(in7, in[3]);
 
   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
   u[1] = _mm_unpacklo_epi16(in[2], in[3]);
   u[2] = _mm_unpacklo_epi16(in7, kZero);
   u[3] = _mm_unpacklo_epi16(in[2], kZero);
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);
 
   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
@@ -219,9 +320,10 @@ void fadst4_1d_sse2(__m128i *in) {
   v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
   v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
 
   u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = v[2];
+  u[1] = _mm_sub_epi32(v[2], v[6]);
   u[2] = _mm_add_epi32(v[3], v[4]);
   u[3] = _mm_sub_epi32(u[2], u[0]);
   u[4] = _mm_slli_epi32(v[5], 2);
@@ -243,32 +345,36 @@ void fadst4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
-                           int stride, int tx_type) {
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
   __m128i in[4];
-  load_buffer_4x4(input, in, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+    case DCT_DCT:
+      vp9_fdct4x4_sse2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
-      fadst4_1d_sse2(in);
-      fdct4_1d_sse2(in);
+    case ADST_DCT:
+      load_buffer_4x4(input, in, stride);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
       break;
-    case 2:  // DCT_ADST
-      fdct4_1d_sse2(in);
-      fadst4_1d_sse2(in);
+    case DCT_ADST:
+      load_buffer_4x4(input, in, stride);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
       break;
-    case 3:  // ADST_ADST
-      fadst4_1d_sse2(in);
-      fadst4_1d_sse2(in);
-      break;
-    default:
-      assert(0);
+    case ADST_ADST:
+      load_buffer_4x4(input, in, stride);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
       break;
+   default:
+     assert(0);
+     break;
   }
-  write_buffer_4x4(output, in);
 }
 
 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
@@ -312,7 +418,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
   for (pass = 0; pass < 2; pass++) {
     // To store results of each pass before the transpose.
     __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/substract
+    // Add/subtract
     const __m128i q0 = _mm_add_epi16(in0, in7);
     const __m128i q1 = _mm_add_epi16(in1, in6);
     const __m128i q2 = _mm_add_epi16(in2, in5);
@@ -323,7 +429,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
     const __m128i q7 = _mm_sub_epi16(in0, in7);
     // Work on first four results
     {
-      // Add/substract
+      // Add/subtract
       const __m128i r0 = _mm_add_epi16(q0, q3);
       const __m128i r1 = _mm_add_epi16(q1, q2);
       const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -385,7 +491,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
       // Combine
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/substract
+      // Add/subtract
       const __m128i x0 = _mm_add_epi16(q4, r0);
       const __m128i x1 = _mm_sub_epi16(q4, r0);
       const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -657,7 +763,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_sse2(__m128i *in) {
+void fdct8_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -797,7 +903,7 @@ void fdct8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void fadst8_1d_sse2(__m128i *in) {
+void fadst8_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1027,40 +1133,46 @@ void fadst8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
-                           int stride, int tx_type) {
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
+                     int stride, int tx_type) {
   __m128i in[8];
-  load_buffer_8x8(input, in, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+    case DCT_DCT:
+      vp9_fdct8x8_sse2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
-      fadst8_1d_sse2(in);
-      fdct8_1d_sse2(in);
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
       break;
-    case 2:  // DCT_ADST
-      fdct8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
       break;
-    case 3:  // ADST_ADST
-      fadst8_1d_sse2(in);
-      fadst8_1d_sse2(in);
+    case ADST_ADST:
+      load_buffer_8x8(input, in, stride);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
       break;
     default:
       assert(0);
       break;
   }
-  right_shift_8x8(in, 1);
-  write_buffer_8x8(output, in, 8);
 }
 
 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -1215,9 +1327,9 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
         step1_6 = _mm_sub_epi16(in01, in14);
         step1_7 = _mm_sub_epi16(in00, in15);
       }
-      // Work on the first eight values; fdct8_1d(input, even_results);
+      // Work on the first eight values; fdct8(input, even_results);
       {
-        // Add/substract
+        // Add/subtract
         const __m128i q0 = _mm_add_epi16(input0, input7);
         const __m128i q1 = _mm_add_epi16(input1, input6);
         const __m128i q2 = _mm_add_epi16(input2, input5);
@@ -1228,7 +1340,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
         const __m128i q7 = _mm_sub_epi16(input0, input7);
         // Work on first four results
         {
-          // Add/substract
+          // Add/subtract
           const __m128i r0 = _mm_add_epi16(q0, q3);
           const __m128i r1 = _mm_add_epi16(q1, q2);
           const __m128i r2 = _mm_sub_epi16(q1, q2);
@@ -1292,7 +1404,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
           // Combine
           const __m128i r0 = _mm_packs_epi32(s0, s1);
           const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/substract
+          // Add/subtract
           const __m128i x0 = _mm_add_epi16(q4, r0);
           const __m128i x1 = _mm_sub_epi16(q4, r0);
           const __m128i x2 = _mm_sub_epi16(q7, r1);
@@ -1729,7 +1841,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   right_shift_8x8(res1 + 8, 2);
 }
 
-void fdct16_1d_8col(__m128i *in) {
+void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -2051,7 +2163,7 @@ void fdct16_1d_8col(__m128i *in) {
   in[15] = _mm_packs_epi32(v[14], v[15]);
 }
 
-void fadst16_1d_8col(__m128i *in) {
+void fadst16_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -2521,48 +2633,51 @@ void fadst16_1d_8col(__m128i *in) {
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fdct16_1d_8col(in0);
-  fdct16_1d_8col(in1);
+void fdct16_sse2(__m128i *in0, __m128i *in1) {
+  fdct16_8col(in0);
+  fdct16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
-  fadst16_1d_8col(in0);
-  fadst16_1d_8col(in1);
+void fadst16_sse2(__m128i *in0, __m128i *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
-                             int stride, int tx_type) {
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
+                       int stride, int tx_type) {
   __m128i in0[16], in1[16];
-  load_buffer_16x16(input, in0, in1, stride);
+
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      fdct16_1d_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+    case DCT_DCT:
+      vp9_fdct16x16_sse2(input, output, stride);
       break;
-    case 1:  // ADST_DCT
-      fadst16_1d_sse2(in0, in1);
+    case ADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fdct16_1d_sse2(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
       break;
-    case 2:  // DCT_ADST
-      fdct16_1d_sse2(in0, in1);
+    case DCT_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
+      fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
       break;
-    case 3:  // ADST_ADST
-      fadst16_1d_sse2(in0, in1);
+    case ADST_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
-      fadst16_1d_sse2(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
       break;
     default:
       assert(0);
       break;
   }
-  write_buffer_16x16(output, in0, in1, 16);
 }
 
 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.asm
new file mode 100644
index 00000000000..8723a71140e
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.asm
@@ -0,0 +1,174 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides an SSSE3 version of the forward transformation. Some
+; of the macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 15137,   6270
+TRANSFORM_COEFFS 16069,   3196
+TRANSFORM_COEFFS  9102,  13623
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+; 1D forward 8x8 DCT transform
+%macro FDCT8_1D 0
+  SUM_SUB            0,  7,  9
+  SUM_SUB            1,  6,  9
+  SUM_SUB            2,  5,  9
+  SUM_SUB            3,  4,  9
+
+  SUM_SUB            0,  3,  9
+  SUM_SUB            1,  2,  9
+  SUM_SUB            6,  5,  9
+  SUM_SUB            0,  1,  9
+
+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
+
+  pmulhrsw           m6, m12
+  pmulhrsw           m5, m12
+  pmulhrsw           m0, m12
+  pmulhrsw           m1, m12
+
+  SUM_SUB            4,  5,  9
+  SUM_SUB            7,  6,  9
+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
+  SWAP               1,  4
+  SWAP               3,  6
+%endmacro
+
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
+  psraw              m%3, m%1, 15
+  psraw              m%4, m%2, 15
+  psubw              m%1, m%3
+  psubw              m%2, m%4
+  psraw              m%1, 1
+  psraw              m%2, 1
+%endmacro
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+  mova               m8, [pd_8192]
+  mova              m12, [pw_11585x2]
+  pxor              m11, m11
+
+  lea                r3, [2 * strideq]
+  lea                r4, [4 * strideq]
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  ; left shift by 2 to increase forward transformation precision
+  psllw              m0, 2
+  psllw              m1, 2
+  psllw              m2, 2
+  psllw              m3, 2
+  psllw              m4, 2
+  psllw              m5, 2
+  psllw              m6, 2
+  psllw              m7, 2
+
+  ; column transform
+  FDCT8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  FDCT8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  DIVIDE_ROUND_2X   0, 1, 9, 10
+  DIVIDE_ROUND_2X   2, 3, 9, 10
+  DIVIDE_ROUND_2X   4, 5, 9, 10
+  DIVIDE_ROUND_2X   6, 7, 9, 10
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
new file mode 100644
index 00000000000..c67490fad34
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vpx/vpx_integer.h"
+
+
+int64_t vp9_block_error_avx2(const int16_t *coeff,
+                             const int16_t *dqcoeff,
+                             intptr_t block_size,
+                             int64_t *ssz) {
+  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+  __m256i sse_reg_64hi, ssz_reg_64hi;
+  __m128i sse_reg128, ssz_reg128;
+  int64_t sse;
+  int i;
+  const __m256i zero_reg = _mm256_set1_epi16(0);
+
+  // init sse and ssz registers to zero
+  sse_reg = _mm256_set1_epi16(0);
+  ssz_reg = _mm256_set1_epi16(0);
+
+  for (i = 0 ; i < block_size ; i+= 16) {
+    // load 32 bytes from coeff and dqcoeff
+    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
+    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+    // dqcoeff - coeff
+    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+    // madd (dqcoeff - coeff)
+    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+    // madd coeff
+    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+    // expand each double word of madd (dqcoeff - coeff) to quad word
+    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+    // expand each double word of madd (coeff) to quad word
+    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+  }
+  // save the higher 64 bit of each 128 bit lane
+  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+  // add the higher 64 bit to the low 64 bit
+  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+  // add the 64-bit sums from the two 128-bit lanes of the 256-bit register
+  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+                             _mm256_extractf128_si256(sse_reg, 1));
+
+  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+                             _mm256_extractf128_si256(ssz_reg, 1));
+
+  // store the results
+  _mm_storel_epi64((__m128i*)(&sse), sse_reg128);
+
+  _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
+  return sse;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h
deleted file mode 100644
index ca80b8bff6b..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_X86_VP9_MCOMP_X86_H_
-#define VP9_ENCODER_X86_VP9_MCOMP_X86_H_
-
-#if HAVE_SSE3
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx3
-
-#undef  vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sadx4
-
-#undef  vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sadx4
-
-#endif
-#endif
-
-#if HAVE_SSE4_1
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx8
-
-#endif
-#endif
-
-#endif
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
index db306603b5f..48ccef8ccfb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -188,7 +188,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmaxsw                          m8, m7
   pshuflw                         m7, m8, 0x1
   pmaxsw                          m8, m7
-  pextrw                        [r2], m8, 0
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
   RET
 
   ; skip-block, i.e. just write all zeroes
@@ -214,5 +215,5 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endmacro
 
 INIT_XMM ssse3
-QUANTIZE_FN b, 6
+QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
new file mode 100644
index 00000000000..f31b176e569
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "vpx/vpx_integer.h"
+
+void vp9_sad32x32x4d_avx2(uint8_t *src,  // 32x32 SAD of src against 4 refs
+                          int src_stride,  // added to src after every row
+                          uint8_t *ref[4],  // the four blocks to compare with
+                          int ref_stride,  // added to each ref after every row
+                          unsigned int res[4]) {  // res[k] = SAD vs ref[k]
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  __m256i sum_mlow, sum_mhigh;
+  int i;
+  uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+  sum_ref0 = _mm256_set1_epi16(0);  // clear the four accumulators
+  sum_ref1 = _mm256_set1_epi16(0);
+  sum_ref2 = _mm256_set1_epi16(0);
+  sum_ref3 = _mm256_set1_epi16(0);
+  for (i = 0; i < 32 ; i++) {  // one 32-byte row per iteration
+    // load src (aligned load: rows must be 32-byte aligned) and all refs
+    src_reg = _mm256_load_si256((__m256i *)(src));
+    ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
+    ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
+    ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
+    ref3_reg = _mm256_loadu_si256((__m256i *) (ref3));
+    // absolute differences of every ref-i to src, summed per 8 bytes
+    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+    // accumulate this row's partial sums for every ref-i
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+    src+= src_stride;  // step every pointer to its next row
+    ref0+= ref_stride;
+    ref1+= ref_stride;
+    ref2+= ref_stride;
+    ref3+= ref_stride;
+  }
+  {
+    __m128i sum;
+    // each 64-bit element of sum_ref-i holds a partial sum in its low
+    // 4 bytes; the high 4 bytes are zero.
+    // shift sum_ref1 and sum_ref3 left by 4 bytes into those zero slots
+    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // interleave sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3
+    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+    // gather the low and the high 64 bits of each 128-bit lane
+    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit: ref0..ref3 sums per lane
+    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+                        _mm256_extractf128_si256(sum_mlow, 1));
+
+    _mm_storeu_si128((__m128i *)(res), sum);  // writes res[0..3] at once
+  }
+}
+
+void vp9_sad64x64x4d_avx2(uint8_t *src,  // 64x64 SAD of src against 4 refs
+                          int src_stride,  // added to src after every row
+                          uint8_t *ref[4],  // the four blocks to compare with
+                          int ref_stride,  // added to each ref after every row
+                          unsigned int res[4]) {  // res[k] = SAD vs ref[k]
+  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
+  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
+  __m256i ref3_reg, ref3next_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  __m256i sum_mlow, sum_mhigh;
+  int i;
+  uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+  sum_ref0 = _mm256_set1_epi16(0);  // clear the four accumulators
+  sum_ref1 = _mm256_set1_epi16(0);
+  sum_ref2 = _mm256_set1_epi16(0);
+  sum_ref3 = _mm256_set1_epi16(0);
+  for (i = 0; i < 64 ; i++) {  // one 64-byte row per iteration
+    // load 64 bytes from src (aligned loads) and all refs (unaligned loads)
+    src_reg = _mm256_load_si256((__m256i *)(src));
+    srcnext_reg = _mm256_load_si256((__m256i *)(src + 32));
+    ref0_reg = _mm256_loadu_si256((__m256i *) (ref0));
+    ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32));
+    ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));
+    ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32));
+    ref2_reg = _mm256_loadu_si256((__m256i *) (ref2));
+    ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32));
+    ref3_reg = _mm256_loadu_si256((__m256i *) (ref3));
+    ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32));
+    // absolute differences of every ref-i to src, summed per 8 bytes
+    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
+    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
+    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
+    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
+
+    // accumulate both 32-byte halves of the row for every ref-i
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
+    src+= src_stride;  // step every pointer to its next row
+    ref0+= ref_stride;
+    ref1+= ref_stride;
+    ref2+= ref_stride;
+    ref3+= ref_stride;
+  }
+  {
+    __m128i sum;
+
+    // each 64-bit element of sum_ref-i holds a partial sum in its low
+    // 4 bytes; the high 4 bytes are zero.
+    // shift sum_ref1 and sum_ref3 left by 4 bytes into those zero slots
+    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // interleave sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3
+    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+    // gather the low and the high 64 bits of each 128-bit lane
+    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit: ref0..ref3 sums per lane
+    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+                        _mm256_extractf128_si256(sum_mlow, 1));
+
+    _mm_storeu_si128((__m128i *)(res), sum);  // writes res[0..3] at once
+  }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
index 533456b77d8..1a9e4e8b6bd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -118,6 +118,14 @@ SECTION .text
   RET
 %endmacro
 
+%macro INC_SRC_BY_SRC_STRIDE  0  ; advance srcq by one source row
+%if ARCH_X86=1 && CONFIG_PIC=1
+  add                srcq, src_stridemp  ; memory form of src_stride
+%else
+  add                srcq, src_strideq  ; register form of src_stride
+%endif
+%endmacro
+
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +137,85 @@ SECTION .text
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                              x_offset, y_offset, \
-                                              dst, dst_stride, \
-                                              sec, sec_stride, height, sse
-%define sec_str sec_strideq
-%else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
-                                          dst, dst_stride, height, sse
-%endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                                         x_offset, y_offset, \
-                                                         dst, dst_stride, \
-                                                         sec, sec_stride, \
-                                                         height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
+
+%ifdef PIC    ; 64bit PIC
+  %if %2 == 1 ; avg
+    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                      x_offset, y_offset, \
+                                      dst, dst_stride, \
+                                      sec, sec_stride, height, sse
+    %define sec_str sec_strideq
+  %else
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+                                  y_offset, dst, dst_stride, height, sse
+  %endif
+  %define h heightd
+  %define bilin_filter sseq
 %else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
-                                          dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
+  %if ARCH_X86=1 && CONFIG_PIC=1
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, \
+                                  sec, sec_stride, \
+                                  height, sse, g_bilin_filter, g_pw_8
+      %define h dword heightm
+      %define sec_str sec_stridemp
+
+      ; Store the bilin_filter and pw_8 addresses on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                                y_offset, dst, dst_stride, height, sse, \
+                                g_bilin_filter, g_pw_8
+      %define h heightd
+
+      ; Store the bilin_filter and pw_8 addresses on the stack
+      GET_GOT eax
+      add esp, 4                ; restore esp
+
+      lea ecx, [GLOBAL(bilin_filter_m)]
+      mov g_bilin_filterm, ecx
+
+      lea ecx, [GLOBAL(pw_8)]
+      mov g_pw_8m, ecx
+
+      LOAD_IF_USED 0, 1         ; load eax, ecx back
+    %endif
+  %else
+    %if %2 == 1 ; avg
+      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                             x_offset, y_offset, \
+                                             dst, dst_stride, \
+                                             sec, sec_stride, \
+                                             height, sse
+      %if ARCH_X86_64
+      %define h heightd
+      %define sec_str sec_strideq
+      %else
+      %define h dword heightm
+      %define sec_str sec_stridemp
+      %endif
+    %else
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+                              y_offset, dst, dst_stride, height, sse
+      %define h heightd
+    %endif
+
+    %define bilin_filter bilin_filter_m
+  %endif
 %endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -329,11 +381,22 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_zero_y_other_loop:
 %if %1 == 16
   movu                 m0, [srcq]
@@ -615,12 +678,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
+%else  ;x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+  add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m3, [srcq+1]
@@ -752,12 +826,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 .x_other_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
@@ -873,12 +958,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
+%else    ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+  add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+1]
@@ -1057,6 +1153,21 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
+%else   ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register, so the src_stride register is
+; reused as temp; src_stride must be reloaded from the stack when needed.
+%define tempq src_strideq
+  mov tempq, g_bilin_filterm
+  add           x_offsetq, tempq
+  add           y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+  mov tempq, g_pw_8m
+%define filter_rnd [tempq]
 %else
   add           x_offsetq, bilin_filter
   add           y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %define filter_y_b [y_offsetq+16]
 %define filter_rnd [pw_8]
 %endif
+%endif
+
   ; x_offset == bilin interpolation && y_offset == bilin interpolation
 %if %1 == 16
   movu                 m0, [srcq]
@@ -1093,7 +1206,9 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   psraw                m0, 4
   psraw                m2, 4
-  add                srcq, src_strideq
+
+  INC_SRC_BY_SRC_STRIDE
+
   packuswb             m0, m2
 .x_other_y_other_loop:
 %if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
-  add                srcq, src_strideq
+  INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
   movh                 m0, [srcq]
@@ -1184,12 +1299,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %if cpuflag(ssse3)
   packuswb             m0, m0
 %endif
-  add                srcq, src_strideq
+
+  INC_SRC_BY_SRC_STRIDE
+
 .x_other_y_other_loop:
   movh                 m2, [srcq]
   movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+
+  INC_SRC_BY_SRC_STRIDE
+  movh                 m4, [srcq]
+  movh                 m3, [srcq+1]
+
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
@@ -1253,7 +1373,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
-  lea                srcq, [srcq+src_strideq*2]
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq+dst_strideq*2]
 %endif
 %if %2 == 1 ; avg
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
new file mode 100644
index 00000000000..34ed1867f61
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -0,0 +1,539 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vpx_ports/mem.h"
+#include "vp9/encoder/vp9_variance.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,  // offset 0
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,  // (2 rows each)
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,  // offset 1
+  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,  // offset 2
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,  // offset 3
+  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,  // offset 4
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,  // offset 5
+  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,  // offset 6
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,  // offset 7
+  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // offset 8
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,  // offset 9
+  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,  // offset 10
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,  // offset 11
+  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,  // offset 12
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,  // offset 13
+  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,  // offset 14
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,  // offset 15
+  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
+};  // 16 filters of 32 bytes: pairs (16 - k, k), read at offset << 5
+
+#define FILTER_SRC(filter) /* needs exp_src_lo/hi and pw8 in scope */ \
+  /* multiply byte pairs by the filter taps, adding each pair */ \
+  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+  \
+  /* add pw8 (callers set it to 8) ahead of the shift by 4 below */ \
+  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+  \
+  /* divide by 16, the sum of each tap pair in bilinear_filters_avx2 */ \
+  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) /* interleave bytes: src_reg, reg */ \
+  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); /* out: exp_src_lo/hi */
+
+#define LOAD_SRC_DST \
+  /* load 32 bytes of source (unaligned) and destination (aligned load) */ \
+  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
+  dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) /* uses src, src_next_reg */ \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  /* bytewise average of src_reg and the 32 bytes at src + size_stride */ \
+  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  MERGE_WITH_SRC(src_reg, src_next_reg) /* pair src_reg with those bytes */
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+  /* expand each dst byte to 2 bytes by interleaving with zero_reg */ \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+  /* source - dest, as 16-bit differences */ \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum (16-bit adds); square the differences for the sse */ \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse (32-bit adds of the squared differences) */ \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// final reduction: stores the total sse through `sse` and the total in `sum`
+#define CALC_SUM_AND_SSE \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); /* sign mask of sums */ \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); /* sign-extend */ \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+  /* fold the upper elements of each 128-bit lane onto the lower ones */ \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+  /* finish the sse per lane, then add the low and the high lane */ \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); /* last fold of the sums */ \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,  // unaligned
+                                             int src_stride,  // src row step
+                                             int x_offset,  // horiz. filter
+                                             int y_offset,  // vert. filter
+                                             const uint8_t *dst,  // aligned
+                                             int dst_stride,  // dst row step
+                                             int height,  // rows to process
+                                             unsigned int *sse) {  // out: sse
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;  // sum: total of (filtered src - dst), the return value
+  sum_reg = _mm256_set1_epi16(0);  // running sums, 16-bit elements
+  sse_reg = _mm256_set1_epi16(0);  // running squared sums, 32-bit elements
+  zero_reg = _mm256_set1_epi16(0);  // zeros used to widen bytes to 16 bits
+  // Each offset is handled as 0 (no filter), 8 (plain average) or bilinear.
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)  // average with the row below
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;  // each filter in the table is 32 bytes long
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)  // pair with the row below
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)  // average with the bytes one to the right
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load the first row and the same row starting from the next
+      // following byte, and average the two
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        src_avg = src_reg;  // save the previous row's horizontal average
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        // accumulate sum and sse for this row
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;  // each filter in the table is 32 bytes long
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load the first row and the same row starting from the next
+      // following byte, and average the two
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save the previous row's horizontal average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)  // pair previous and current rows
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;  // each filter in the table is 32 bytes long
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)  // pair with the bytes one to the right
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;  // each filter in the table is 32 bytes long
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src_pack = src_reg;  // keep this filtered row for the next pass
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;  // each filter in the table is 32 bytes long
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load the first row and the same row starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the merged rows vertically
+        FILTER_SRC(yfilter)
+        src_pack = src_reg;  // keep this filtered row for the next pass
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE  // reduces sum_reg into sum and stores the sse via sse
+  return sum;
+}
+
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             const uint8_t *sec,
+                                             int sec_stride,
+                                             int height,
+                                             unsigned int *sse) {
+  __m256i sec_reg;
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expend each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    } else if (y_offset == 8) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expend each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                 (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
deleted file mode 100644
index 2ecc23e5594..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
+++ /dev/null
@@ -1,337 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp9_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-.half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance16x_h_1    ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp9_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax          ]
-        pxor            xmm0,           xmm0
-
-.half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             .half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp9_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-.half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance16x_h_1         ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
new file mode 100644
index 00000000000..835c519576e
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_avx2.c
@@ -0,0 +1,268 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vp9/common/vp9_pragmas.h"
+#include "vpx_ports/mem.h"
+
+typedef void (*get_var_avx2) (  // kernel type taken by variance_avx2()
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,  // out: variance_avx2() adds it to its running SSE
+  int *Sum  // out: variance_avx2() adds it to its running sum
+);
+
+void vp9_get16x16var_avx2  // defined in vp9_variance_impl_intrin_avx2.c
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,  // out: read by callers after the call
+  int *Sum  // out: read by callers after the call
+);
+
+void vp9_get32x32var_avx2  // passed to variance_avx2() with block_size 32
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,  // out: read by variance_avx2() after the call
+  int *Sum  // out: read by variance_avx2() after the call
+);
+
+unsigned int vp9_sub_pixel_variance32xh_avx2  // callers keep result as int
+(
+  const uint8_t *src,
+  int src_stride,
+  int x_offset,
+  int y_offset,
+  const uint8_t *dst,
+  int dst_stride,
+  int height,  // callers in this file pass 64 or 32
+  unsigned int *sse  // out: read by callers after the call
+);
+
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2  // result is used as int
+(
+  const uint8_t *src,
+  int src_stride,
+  int x_offset,
+  int y_offset,
+  const uint8_t *dst,
+  int dst_stride,
+  const uint8_t *sec,
+  int sec_stride,  // callers in this file pass 64 or 32
+  int height,  // callers in this file pass 64 or 32
+  unsigned int *sseptr  // out: read by callers after the call
+);
+
+static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
+                        const unsigned char *ref_ptr, int  recon_stride,
+                        int  w, int  h, unsigned int *sse, int *sum,
+                        get_var_avx2 var_fn, int block_size) {
+  // Sum var_fn's SSE/sum over tiles of block_size columns by 16 rows.
+  int row, col;
+  *sse = 0;
+  *sum = 0;
+  for (row = 0; row < h; row += 16) {
+    const unsigned char *src_row = src_ptr + source_stride * row;
+    const unsigned char *ref_row = ref_ptr + recon_stride * row;
+    for (col = 0; col < w; col += block_size) {
+      unsigned int tile_sse;
+      int tile_sum;
+      var_fn(src_row + col, source_stride, ref_row + col, recon_stride,
+             &tile_sse, &tile_sum);
+      *sse += tile_sse;
+      *sum += tile_sum;
+    }
+  }
+}
+
+// 16x16 variance: SSE - ((sum * sum) >> 8); the raw SSE is stored in *sse.
+unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
+                                    int source_stride,
+                                    const unsigned char *ref_ptr,
+                                    int recon_stride,
+                                    unsigned int *sse) {
+  unsigned int sq_err, sum_sq;
+  int diff_sum;
+
+  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
+                &sq_err, &diff_sum, vp9_get16x16var_avx2, 16);
+  sum_sq = (unsigned int)diff_sum * diff_sum;
+  *sse = sq_err;
+  return sq_err - (sum_sq >> 8);
+}
+
+// 16x16 SSE; the sum that the kernel also produces is discarded.
+unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
+                               int source_stride,
+                               const unsigned char *ref_ptr,
+                               int recon_stride,
+                               unsigned int *sse) {
+  unsigned int sq_err;
+  int unused_sum;
+  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
+                       &sq_err, &unused_sum);
+  *sse = sq_err;
+  return sq_err;
+}
+
+// 32x32 variance: SSE - ((sum * sum) >> 10); the raw SSE is stored in *sse.
+unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int sq_err;
+  int diff_sum;
+  // variance_avx2 calls the 32-wide kernel once per 16-row band
+  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
+                &sq_err, &diff_sum, vp9_get32x32var_avx2, 32);
+  *sse = sq_err;
+  return sq_err - (((int64_t)diff_sum * diff_sum) >> 10);
+}
+
+// 32x16 variance: SSE - ((sum * sum) >> 9); the raw SSE is stored in *sse.
+unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int sq_err;
+  int diff_sum;
+  // variance_avx2 calls the 32-wide kernel once per 16-row band
+  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
+                &sq_err, &diff_sum, vp9_get32x32var_avx2, 32);
+  *sse = sq_err;
+  return sq_err - (((int64_t)diff_sum * diff_sum) >> 9);
+}
+
+
+// 64x64 variance: SSE - ((sum * sum) >> 12); the raw SSE is stored in *sse.
+unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int sq_err;
+  int diff_sum;
+  // variance_avx2 calls the 32-wide kernel once per 16-row band
+  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
+                &sq_err, &diff_sum, vp9_get32x32var_avx2, 32);
+  *sse = sq_err;
+  return sq_err - (((int64_t)diff_sum * diff_sum) >> 12);
+}
+
+// 64x32 variance: SSE - ((sum * sum) >> 11); the raw SSE is stored in *sse.
+unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int sq_err;
+  int diff_sum;
+
+  // variance_avx2 calls the 32-wide kernel once per 16-row band
+  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
+                &sq_err, &diff_sum, vp9_get32x32var_avx2, 32);
+  *sse = sq_err;
+  return sq_err - (((int64_t)diff_sum * diff_sum) >> 11);
+}
+
+// 64x64 sub-pixel variance, computed as two 32-column halves of 64 rows.
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  unsigned int sse_left, sse_right, sse_total;
+  int sum_left, sum_right, sum_total;
+
+  // columns 0..31
+  sum_left = vp9_sub_pixel_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse_left);
+  // columns 32..63
+  sum_right = vp9_sub_pixel_variance32xh_avx2(
+      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,
+      64, &sse_right);
+  sum_total = sum_left + sum_right;
+  sse_total = sse_left + sse_right;
+  *sse_ptr = sse_total;
+  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
+}
+
+// 32x32 sub-pixel variance: a single call of the 32xh kernel with height 32.
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse_ptr) {
+  unsigned int sq_err;
+  const int diff_sum = vp9_sub_pixel_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, &sq_err);
+
+  *sse_ptr = sq_err;
+  return sq_err - (((int64_t)diff_sum * diff_sum) >> 10);
+}
+
+// 64x64 sub-pixel avg variance as two 32-column halves; sec stride is 64.
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sseptr,
+                                                  const uint8_t *sec) {
+  unsigned int sse_left, sse_right, sse_total;
+  int sum_left, sum_right, sum_total;
+
+  // columns 0..31
+  sum_left = vp9_sub_pixel_avg_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride,
+      sec, 64, 64, &sse_left);
+  // columns 32..63
+  sum_right = vp9_sub_pixel_avg_variance32xh_avx2(
+      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,
+      sec + 32, 64, 64, &sse_right);
+  sum_total = sum_left + sum_right;
+  sse_total = sse_left + sse_right;
+  *sseptr = sse_total;
+  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
+}
+
+// 32x32 sub-pixel avg variance: one 32xh kernel call; sec stride is 32.
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sseptr,
+                                                  const uint8_t *sec) {
+  unsigned int sq_err;
+  const int diff_sum = vp9_sub_pixel_avg_variance32xh_avx2(
+      src, src_stride, x_offset, y_offset, dst, dst_stride,
+      sec, 32, 32, &sq_err);
+  *sseptr = sq_err;
+  return sq_err - (((int64_t)diff_sum * diff_sum) >> 10);
+}
+
+
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
new file mode 100644
index 00000000000..f9923280a34
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+void vp9_get16x16var_avx2(const unsigned char *src_ptr,  // first row of the source block
+                          int source_stride,  // step from one source row to the next
+                          const unsigned char *ref_ptr,  // first row of the reference block
+                          int recon_stride,  // step from one reference row to the next
+                          unsigned int *SSE,  // out: sum of (src - ref)^2
+                          int *Sum) {  // out: sum of (src - ref)
+    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+    __m256i ref_expand_high, madd_low, madd_high;
+    unsigned int i, src_2strides, ref_2strides;
+    __m256i zero_reg = _mm256_set1_epi16(0);
+    __m256i sum_ref_src = _mm256_set1_epi16(0);
+    __m256i madd_ref_src = _mm256_set1_epi16(0);
+    // Accumulates the sum of (src - ref) and the sum of (src - ref)^2 over
+    // 16 rows of 16 pixels.  Each 256-bit register holds two rows (one per
+    // 128-bit lane), so the loop runs 8 times rather than once per row.
+    src_2strides = source_stride << 1;
+    ref_2strides = recon_stride << 1;
+    for (i = 0; i < 8; i++) {
+        src = _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i const *) (src_ptr)));
+        src = _mm256_inserti128_si256(src,
+              _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1);
+
+        ref =_mm256_castsi128_si256(
+             _mm_loadu_si128((__m128i const *) (ref_ptr)));
+        ref = _mm256_inserti128_si256(ref,
+              _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1);
+
+        // zero-extend the bytes to 16 bits (low and high 8 bytes of each lane)
+        src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+        src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+        ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+        ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+        // src - ref, as signed 16-bit differences
+        src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+        src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+        // square the low-half differences; madd sums adjacent pairs into 32 bits
+        madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+        // fold the high-half differences into the low half (madd_low was taken first)
+        src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+        // square the high-half differences
+        madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+        sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);  // 16-bit running sums: 16 differences per element, at most 16 * 255
+
+        // accumulate both halves of the squared differences (32-bit elements)
+        madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                       _mm256_add_epi32(madd_low, madd_high));
+
+        src_ptr+= src_2strides;
+        ref_ptr+= ref_2strides;
+    }
+
+    {
+        __m128i sum_res, madd_res;
+        __m128i expand_sum_low, expand_sum_high, expand_sum;
+        __m128i expand_madd_low, expand_madd_high, expand_madd;
+        __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+        // add the upper 128-bit lane to the lower one
+        sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
+                                _mm256_extractf128_si256(sum_ref_src, 1));
+
+        madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
+                                 _mm256_extractf128_si256(madd_ref_src, 1));
+
+        // put each 16-bit sum in the upper half of a 32-bit element (lower half zero)
+        expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
+                                            sum_res);
+        expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
+                                             sum_res);
+
+        // arithmetic shift right by 16 turns them into sign-extended 32-bit sums
+        expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
+        expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
+
+        expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
+
+        // interleave the 32-bit madd sums with zeros so each sits in a 64-bit slot
+        expand_madd_low = _mm_unpacklo_epi32(madd_res,
+                          _mm256_castsi256_si128(zero_reg));
+        expand_madd_high = _mm_unpackhi_epi32(madd_res,
+                           _mm256_castsi256_si128(zero_reg));
+
+        expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
+
+        ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
+                            _mm256_castsi256_si128(zero_reg));
+        ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
+                             _mm256_castsi256_si128(zero_reg));
+
+        ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+        // shift right by 8 bytes to bring the upper 64-bit slot down, then add it in
+        madd_res = _mm_srli_si128(expand_madd, 8);
+        sum_res = _mm_srli_si128(ex_expand_sum, 8);
+
+        madd_res = _mm_add_epi32(madd_res, expand_madd);
+        sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
+
+        *((int*)SSE)= _mm_cvtsi128_si32(madd_res);  // lowest 32-bit element holds the total of the squares
+
+        *((int*)Sum)= _mm_cvtsi128_si32(sum_res);  // lowest 32-bit element holds the total of the differences
+    }
+}
+
+void vp9_get32x32var_avx2(const unsigned char *src_ptr,  // first row of the source block
+                          int source_stride,  // step from one source row to the next
+                          const unsigned char *ref_ptr,  // first row of the reference block
+                          int recon_stride,  // step from one reference row to the next
+                          unsigned int *SSE,  // out: sum of (src - ref)^2
+                          int *Sum) {  // out: sum of (src - ref)
+    __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+    __m256i ref_expand_high, madd_low, madd_high;
+    unsigned int i;
+    __m256i zero_reg = _mm256_set1_epi16(0);
+    __m256i sum_ref_src = _mm256_set1_epi16(0);
+    __m256i madd_ref_src = _mm256_set1_epi16(0);
+    // Accumulates the sum of (src - ref) and the sum of (src - ref)^2, one
+    // 32-pixel row (a full 256-bit load) per iteration, for 16 rows.
+    for (i = 0; i < 16; i++) {  // NOTE(review): only 16 rows are covered despite the 32x32 name; presumably the caller invokes this per 16-row band -- confirm
+       src = _mm256_loadu_si256((__m256i const *) (src_ptr));
+
+       ref = _mm256_loadu_si256((__m256i const *) (ref_ptr));
+
+       // zero-extend the bytes to 16 bits (low and high 8 bytes of each lane)
+       src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+       src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+       ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+       ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+       // src - ref, as signed 16-bit differences
+       src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+       src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+       // square the low-half differences; madd sums adjacent pairs into 32 bits
+       madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+       // fold the high-half differences into the low half (madd_low was taken first)
+       src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+       // square the high-half differences
+       madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+       sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);  // 16-bit running sums: 32 differences per element, at most 32 * 255
+
+       // accumulate both halves of the squared differences (32-bit elements)
+       madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                      _mm256_add_epi32(madd_low, madd_high));
+
+       src_ptr+= source_stride;
+       ref_ptr+= recon_stride;
+    }
+
+    {
+      __m256i expand_sum_low, expand_sum_high, expand_sum;
+      __m256i expand_madd_low, expand_madd_high, expand_madd;
+      __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+      // put each 16-bit sum in the upper half of a 32-bit element (lower half zero)
+      expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
+      expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
+
+      // arithmetic shift right by 16 turns them into sign-extended 32-bit sums
+      expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
+      expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
+
+      expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
+
+      // interleave the 32-bit madd sums with zeros so each sits in a 64-bit slot
+      expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
+      expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
+
+      expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
+
+      ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
+      ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
+
+      ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+      // shift each 128-bit lane right by 8 bytes, then add, leaving a per-lane total
+      madd_ref_src = _mm256_srli_si256(expand_madd, 8);
+      sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
+
+      madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
+      sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
+
+      // add the lowest 32-bit element of the low lane to that of the high lane
+      *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
+      _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
+
+      *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
+      _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
+    }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 2c50881340d..4830412788e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -398,337 +398,4 @@ sym(vp9_get8x8var_sse2):
     pop         rbp
     ret
 
-;void vp9_half_horiz_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
-sym(vp9_half_horiz_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-%else
-        add             rsi, r8
-%endif
-
-.half_horiz_vert_variance8x_h_1:
-
-        movq            xmm1,           QWORD PTR [rsi]     ;
-        movq            xmm2,           QWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance8x_h_1     ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
-sym(vp9_half_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-.half_vert_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s1,s2,s3..s9
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
 
-        sub             rcx,            1                   ;
-        jnz             .half_vert_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_horiz_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
-sym(vp9_half_horiz_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            xmm0,           xmm0                ;
-.half_horiz_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
index a3d011401dd..c4d17fc0f74 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
@@ -13,7 +13,6 @@
 #include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
 extern unsigned int vp9_get8x8var_mmx
 (
   const unsigned char *src_ptr,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index 79e42c4cd4a..41f225922e4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -24,10 +24,6 @@ extern unsigned int vp9_get4x4var_mmx
   int *Sum
 );
 
-unsigned int vp9_get_mb_ss_sse2
-(
-  const int16_t *src_ptr
-);
 unsigned int vp9_get16x16var_sse2
 (
   const unsigned char *src_ptr,
@@ -46,66 +42,6 @@ unsigned int vp9_get8x8var_sse2
   unsigned int *SSE,
   int *Sum
 );
-void vp9_half_horiz_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
 
 typedef unsigned int (*get_var_sse2) (
   const unsigned char *src_ptr,
@@ -498,58 +434,3 @@ FNS(ssse3, ssse3);
 
 #undef FNS
 #undef FN
-
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vp9_half_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
author	Jocelyn Turcotte <jocelyn.turcotte@digia.com>	2014-08-08 14:30:41 +0200
committer	Jocelyn Turcotte <jocelyn.turcotte@digia.com>	2014-08-12 13:49:54 +0200
commit	ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch)
tree	498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/libvpx/source/libvpx/vp9/encoder
parent	4ce69f7403811819800e7c5ae1318b2647e778d1 (diff)